From a5fb382a29991c8eafcb8c54cdd8c7aab260c237 Mon Sep 17 00:00:00 2001 From: Patrick Cloke Date: Mon, 20 Mar 2023 14:32:26 -0400 Subject: [PATCH] Separate HTTP preview code and URL previewer. (#15269) Separates REST layer code from the actual URL previewing. --- changelog.d/15269.misc | 1 + synapse/media/url_previewer.py | 833 +++++++++++++++++++++ synapse/rest/media/preview_url_resource.py | 796 +------------------- tests/rest/media/test_url_preview.py | 34 +- 4 files changed, 854 insertions(+), 810 deletions(-) create mode 100644 changelog.d/15269.misc create mode 100644 synapse/media/url_previewer.py diff --git a/changelog.d/15269.misc b/changelog.d/15269.misc new file mode 100644 index 000000000..b3126fb1f --- /dev/null +++ b/changelog.d/15269.misc @@ -0,0 +1 @@ +Reorganize URL preview code. diff --git a/synapse/media/url_previewer.py b/synapse/media/url_previewer.py new file mode 100644 index 000000000..c8a4a809f --- /dev/null +++ b/synapse/media/url_previewer.py @@ -0,0 +1,833 @@ +# Copyright 2016 OpenMarket Ltd +# Copyright 2020-2023 The Matrix.org Foundation C.I.C. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import datetime +import errno +import fnmatch +import logging +import os +import re +import shutil +import sys +import traceback +from typing import TYPE_CHECKING, BinaryIO, Iterable, Optional, Tuple +from urllib.parse import urljoin, urlparse, urlsplit +from urllib.request import urlopen + +import attr + +from twisted.internet.defer import Deferred +from twisted.internet.error import DNSLookupError + +from synapse.api.errors import Codes, SynapseError +from synapse.http.client import SimpleHttpClient +from synapse.logging.context import make_deferred_yieldable, run_in_background +from synapse.media._base import FileInfo, get_filename_from_headers +from synapse.media.media_storage import MediaStorage +from synapse.media.oembed import OEmbedProvider +from synapse.media.preview_html import decode_body, parse_html_to_open_graph +from synapse.metrics.background_process_metrics import run_as_background_process +from synapse.types import JsonDict, UserID +from synapse.util import json_encoder +from synapse.util.async_helpers import ObservableDeferred +from synapse.util.caches.expiringcache import ExpiringCache +from synapse.util.stringutils import random_string + +if TYPE_CHECKING: + from synapse.media.media_repository import MediaRepository + from synapse.server import HomeServer + +logger = logging.getLogger(__name__) + +OG_TAG_NAME_MAXLEN = 50 +OG_TAG_VALUE_MAXLEN = 1000 + +ONE_HOUR = 60 * 60 * 1000 +ONE_DAY = 24 * ONE_HOUR +IMAGE_CACHE_EXPIRY_MS = 2 * ONE_DAY + + +@attr.s(slots=True, frozen=True, auto_attribs=True) +class DownloadResult: + length: int + uri: str + response_code: int + media_type: str + download_name: Optional[str] + expires: int + etag: Optional[str] + + +@attr.s(slots=True, frozen=True, auto_attribs=True) +class MediaInfo: + """ + Information parsed from downloading media being previewed. + """ + + # The Content-Type header of the response. 
+    media_type: str
+    # The length (in bytes) of the downloaded media.
+    media_length: int
+    # The media filename, according to the server. This is parsed from the
+    # returned headers, if possible.
+    download_name: Optional[str]
+    # The time of the preview.
+    created_ts_ms: int
+    # Information from the media storage provider about where the file is stored
+    # on disk.
+    filesystem_id: str
+    filename: str
+    # The URI being previewed.
+    uri: str
+    # The HTTP response code.
+    response_code: int
+    # The timestamp (in milliseconds) of when this preview expires.
+    expires: int
+    # The ETag header of the response.
+    etag: Optional[str]
+
+
+class UrlPreviewer:
+    """
+    Generates an Open Graph (https://ogp.me/) response (with some
+    Matrix-specific additions) for a given URL.
+
+    When Synapse is asked to preview a URL it does the following:
+
+    1. Checks against a URL blacklist (defined as `url_preview_url_blacklist` in the
+       config).
+    2. Checks the URL against an in-memory cache and returns the result if it exists. (This
+       is also used to de-duplicate processing of multiple in-flight requests at once.)
+    3. Kicks off a background process to generate a preview:
+       1. Checks URL and timestamp against the database cache and returns the result if it
+          has not expired and was successful (a 2xx return code).
+       2. Checks if the URL matches an oEmbed (https://oembed.com/) pattern. If it
+          does, updates the URL to download.
+       3. Downloads the URL and stores it into a file via the media storage provider
+          and saves the local media metadata.
+       4. If the media is an image:
+          1. Generates thumbnails.
+          2. Generates an Open Graph response based on image properties.
+       5. If the media is HTML:
+          1. Decodes the HTML via the stored file.
+          2. Generates an Open Graph response from the HTML.
+          3. If a JSON oEmbed URL was found in the HTML via autodiscovery:
+             1. Downloads the URL and stores it into a file via the media storage provider
+                and saves the local media metadata.
+             2. Converts the oEmbed response to an Open Graph response.
+             3. Overrides any Open Graph data from the HTML with data from oEmbed.
+          4. If an image exists in the Open Graph response:
+             1. Downloads the URL and stores it into a file via the media storage
+                provider and saves the local media metadata.
+             2. Generates thumbnails.
+             3. Updates the Open Graph response based on image properties.
+       6. If the media is JSON and an oEmbed URL was found:
+          1. Converts the oEmbed response to an Open Graph response.
+          2. If a thumbnail or image is in the oEmbed response:
+             1. Downloads the URL and stores it into a file via the media storage
+                provider and saves the local media metadata.
+             2. Generates thumbnails.
+             3. Updates the Open Graph response based on image properties.
+       7. Stores the result in the database cache.
+    4. Returns the result.
+
+    If any additional request (e.g. from oEmbed autodiscovery, step 5.3, or
+    image thumbnailing, step 5.4 or 6.4) fails, the URL preview as a whole
+    does not fail. As much information as possible is returned.
+
+    The in-memory cache expires after 1 hour.
+
+    Expired entries in the database cache (and their associated media files) are
+    deleted every 10 seconds. The default expiration time is 1 hour from download.
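+
+    A rough usage sketch (illustrative only; `hs`, `media_repo`, `media_storage`,
+    `requester` and `ts` are assumed to already exist, as they do in
+    PreviewUrlResource):
+
+        previewer = UrlPreviewer(hs, media_repo, media_storage)
+        og_json_bytes = await previewer.preview(url, requester.user, ts)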
+ """ + + def __init__( + self, + hs: "HomeServer", + media_repo: "MediaRepository", + media_storage: MediaStorage, + ): + self.clock = hs.get_clock() + self.filepaths = media_repo.filepaths + self.max_spider_size = hs.config.media.max_spider_size + self.server_name = hs.hostname + self.store = hs.get_datastores().main + self.client = SimpleHttpClient( + hs, + treq_args={"browser_like_redirects": True}, + ip_whitelist=hs.config.media.url_preview_ip_range_whitelist, + ip_blacklist=hs.config.media.url_preview_ip_range_blacklist, + use_proxy=True, + ) + self.media_repo = media_repo + self.primary_base_path = media_repo.primary_base_path + self.media_storage = media_storage + + self._oembed = OEmbedProvider(hs) + + # We run the background jobs if we're the instance specified (or no + # instance is specified, where we assume there is only one instance + # serving media). + instance_running_jobs = hs.config.media.media_instance_running_background_jobs + self._worker_run_media_background_jobs = ( + instance_running_jobs is None + or instance_running_jobs == hs.get_instance_name() + ) + + self.url_preview_url_blacklist = hs.config.media.url_preview_url_blacklist + self.url_preview_accept_language = hs.config.media.url_preview_accept_language + + # memory cache mapping urls to an ObservableDeferred returning + # JSON-encoded OG metadata + self._cache: ExpiringCache[str, ObservableDeferred] = ExpiringCache( + cache_name="url_previews", + clock=self.clock, + # don't spider URLs more often than once an hour + expiry_ms=ONE_HOUR, + ) + + if self._worker_run_media_background_jobs: + self._cleaner_loop = self.clock.looping_call( + self._start_expire_url_cache_data, 10 * 1000 + ) + + async def preview(self, url: str, user: UserID, ts: int) -> bytes: + # XXX: we could move this into _do_preview if we wanted. + url_tuple = urlsplit(url) + for entry in self.url_preview_url_blacklist: + match = True + for attrib in entry: + pattern = entry[attrib] + value = getattr(url_tuple, attrib) + logger.debug( + "Matching attrib '%s' with value '%s' against pattern '%s'", + attrib, + value, + pattern, + ) + + if value is None: + match = False + continue + + # Some attributes might not be parsed as strings by urlsplit (such as the + # port, which is parsed as an int). Because we use match functions that + # expect strings, we want to make sure that's what we give them. + value_str = str(value) + + if pattern.startswith("^"): + if not re.match(pattern, value_str): + match = False + continue + else: + if not fnmatch.fnmatch(value_str, pattern): + match = False + continue + if match: + logger.warning("URL %s blocked by url_blacklist entry %s", url, entry) + raise SynapseError( + 403, "URL blocked by url pattern blacklist entry", Codes.UNKNOWN + ) + + # the in-memory cache: + # * ensures that only one request is active at a time + # * takes load off the DB for the thundering herds + # * also caches any failures (unlike the DB) so we don't keep + # requesting the same endpoint + + observable = self._cache.get(url) + + if not observable: + download = run_in_background(self._do_preview, url, user, ts) + observable = ObservableDeferred(download, consumeErrors=True) + self._cache[url] = observable + else: + logger.info("Returning cached response") + + return await make_deferred_yieldable(observable.observe()) + + async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes: + """Check the db, and download the URL and build a preview + + Args: + url: The URL to preview. + user: The user requesting the preview. 
+ ts: The timestamp requested for the preview. + + Returns: + json-encoded og data + """ + # check the URL cache in the DB (which will also provide us with + # historical previews, if we have any) + cache_result = await self.store.get_url_cache(url, ts) + if ( + cache_result + and cache_result["expires_ts"] > ts + and cache_result["response_code"] / 100 == 2 + ): + # It may be stored as text in the database, not as bytes (such as + # PostgreSQL). If so, encode it back before handing it on. + og = cache_result["og"] + if isinstance(og, str): + og = og.encode("utf8") + return og + + # If this URL can be accessed via oEmbed, use that instead. + url_to_download = url + oembed_url = self._oembed.get_oembed_url(url) + if oembed_url: + url_to_download = oembed_url + + media_info = await self._handle_url(url_to_download, user) + + logger.debug("got media_info of '%s'", media_info) + + # The number of milliseconds that the response should be considered valid. + expiration_ms = media_info.expires + author_name: Optional[str] = None + + if _is_media(media_info.media_type): + file_id = media_info.filesystem_id + dims = await self.media_repo._generate_thumbnails( + None, file_id, file_id, media_info.media_type, url_cache=True + ) + + og = { + "og:description": media_info.download_name, + "og:image": f"mxc://{self.server_name}/{media_info.filesystem_id}", + "og:image:type": media_info.media_type, + "matrix:image:size": media_info.media_length, + } + + if dims: + og["og:image:width"] = dims["width"] + og["og:image:height"] = dims["height"] + else: + logger.warning("Couldn't get dims for %s" % url) + + # define our OG response for this media + elif _is_html(media_info.media_type): + # TODO: somehow stop a big HTML tree from exploding synapse's RAM + + with open(media_info.filename, "rb") as file: + body = file.read() + + tree = decode_body(body, media_info.uri, media_info.media_type) + if tree is not None: + # Check if this HTML document points to oEmbed information and + # defer to that. + oembed_url = self._oembed.autodiscover_from_html(tree) + og_from_oembed: JsonDict = {} + if oembed_url: + try: + oembed_info = await self._handle_url( + oembed_url, user, allow_data_urls=True + ) + except Exception as e: + # Fetching the oEmbed info failed, don't block the entire URL preview. + logger.warning( + "oEmbed fetch failed during URL preview: %s errored with %s", + oembed_url, + e, + ) + else: + ( + og_from_oembed, + author_name, + expiration_ms, + ) = await self._handle_oembed_response( + url, oembed_info, expiration_ms + ) + + # Parse Open Graph information from the HTML in case the oEmbed + # response failed or is incomplete. + og_from_html = parse_html_to_open_graph(tree) + + # Compile the Open Graph response by using the scraped + # information from the HTML and overlaying any information + # from the oEmbed response. + og = {**og_from_html, **og_from_oembed} + + await self._precache_image_url(user, media_info, og) + else: + og = {} + + elif oembed_url: + # Handle the oEmbed information. 
+ og, author_name, expiration_ms = await self._handle_oembed_response( + url, media_info, expiration_ms + ) + await self._precache_image_url(user, media_info, og) + + else: + logger.warning("Failed to find any OG data in %s", url) + og = {} + + # If we don't have a title but we have author_name, copy it as + # title + if not og.get("og:title") and author_name: + og["og:title"] = author_name + + # filter out any stupidly long values + keys_to_remove = [] + for k, v in og.items(): + # values can be numeric as well as strings, hence the cast to str + if len(k) > OG_TAG_NAME_MAXLEN or len(str(v)) > OG_TAG_VALUE_MAXLEN: + logger.warning( + "Pruning overlong tag %s from OG data", k[:OG_TAG_NAME_MAXLEN] + ) + keys_to_remove.append(k) + for k in keys_to_remove: + del og[k] + + logger.debug("Calculated OG for %s as %s", url, og) + + jsonog = json_encoder.encode(og) + + # Cap the amount of time to consider a response valid. + expiration_ms = min(expiration_ms, ONE_DAY) + + # store OG in history-aware DB cache + await self.store.store_url_cache( + url, + media_info.response_code, + media_info.etag, + media_info.created_ts_ms + expiration_ms, + jsonog, + media_info.filesystem_id, + media_info.created_ts_ms, + ) + + return jsonog.encode("utf8") + + async def _download_url(self, url: str, output_stream: BinaryIO) -> DownloadResult: + """ + Fetches a remote URL and parses the headers. + + Args: + url: The URL to fetch. + output_stream: The stream to write the content to. + + Returns: + A tuple of: + Media length, URL downloaded, the HTTP response code, + the media type, the downloaded file name, the number of + milliseconds the result is valid for, the etag header. + """ + + try: + logger.debug("Trying to get preview for url '%s'", url) + length, headers, uri, code = await self.client.get_file( + url, + output_stream=output_stream, + max_size=self.max_spider_size, + headers={ + b"Accept-Language": self.url_preview_accept_language, + # Use a custom user agent for the preview because some sites will only return + # Open Graph metadata to crawler user agents. Omit the Synapse version + # string to avoid leaking information. + b"User-Agent": [ + "Synapse (bot; +https://github.com/matrix-org/synapse)" + ], + }, + is_allowed_content_type=_is_previewable, + ) + except SynapseError: + # Pass SynapseErrors through directly, so that the servlet + # handler will return a SynapseError to the client instead of + # blank data or a 500. + raise + except DNSLookupError: + # DNS lookup returned no results + # Note: This will also be the case if one of the resolved IP + # addresses is blacklisted + raise SynapseError( + 502, + "DNS resolution failure during URL preview generation", + Codes.UNKNOWN, + ) + except Exception as e: + # FIXME: pass through 404s and other error messages nicely + logger.warning("Error downloading %s: %r", url, e) + + raise SynapseError( + 500, + "Failed to download content: %s" + % (traceback.format_exception_only(sys.exc_info()[0], e),), + Codes.UNKNOWN, + ) + + if b"Content-Type" in headers: + media_type = headers[b"Content-Type"][0].decode("ascii") + else: + media_type = "application/octet-stream" + + download_name = get_filename_from_headers(headers) + + # FIXME: we should calculate a proper expiration based on the + # Cache-Control and Expire headers. But for now, assume 1 hour. 
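+        # One possible approach (a sketch only, not implemented here) would be to
+        # read a max-age directive using the same header access pattern as above,
+        # instead of the fixed ONE_HOUR default below:
+        #
+        #     if b"Cache-Control" in headers:
+        #         cache_control = headers[b"Cache-Control"][0].decode("ascii")
+        #         max_age = re.search(r"max-age=(\d+)", cache_control)
+        #         if max_age:
+        #             expires = int(max_age.group(1)) * 1000  # seconds -> ms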
+        expires = ONE_HOUR
+        etag = headers[b"ETag"][0].decode("ascii") if b"ETag" in headers else None
+
+        return DownloadResult(
+            length, uri, code, media_type, download_name, expires, etag
+        )
+
+    async def _parse_data_url(
+        self, url: str, output_stream: BinaryIO
+    ) -> DownloadResult:
+        """
+        Parses a data: URL.
+
+        Args:
+            url: The URL to parse.
+            output_stream: The stream to write the content to.
+
+        Returns:
+            A tuple of:
+                Media length, URL downloaded, the HTTP response code,
+                the media type, the downloaded file name, the number of
+                milliseconds the result is valid for, the etag header.
+        """
+
+        try:
+            logger.debug("Trying to parse data url '%s'", url)
+            with urlopen(url) as url_info:
+                # TODO: Can this be more efficient?
+                output_stream.write(url_info.read())
+        except Exception as e:
+            logger.warning("Error parsing data: URL %s: %r", url, e)
+
+            raise SynapseError(
+                500,
+                "Failed to parse data URL: %s"
+                % (traceback.format_exception_only(sys.exc_info()[0], e),),
+                Codes.UNKNOWN,
+            )
+
+        return DownloadResult(
+            # Read back the length that has been written.
+            length=output_stream.tell(),
+            uri=url,
+            # If it was parsed, consider this a 200 OK.
+            response_code=200,
+            # urlopen shoves the media-type from the data URL into the content type
+            # header object.
+            media_type=url_info.headers.get_content_type(),
+            # Some features are not supported by data: URLs.
+            download_name=None,
+            expires=ONE_HOUR,
+            etag=None,
+        )
+
+    async def _handle_url(
+        self, url: str, user: UserID, allow_data_urls: bool = False
+    ) -> MediaInfo:
+        """
+        Fetches content from a URL and parses the result to generate a MediaInfo.
+
+        It uses the media storage provider to persist the fetched content and
+        stores the mapping into the database.
+
+        Args:
+            url: The URL to fetch.
+            user: The user who has requested this URL.
+            allow_data_urls: True if data URLs should be allowed.
+
+        Returns:
+            A MediaInfo object describing the fetched content.
+        """
+
+        # TODO: we should probably honour robots.txt... except in practice
+        # we're most likely being explicitly triggered by a human rather than a
+        # bot, so are we really a robot?
+
+        file_id = datetime.date.today().isoformat() + "_" + random_string(16)
+
+        file_info = FileInfo(server_name=None, file_id=file_id, url_cache=True)
+
+        with self.media_storage.store_into_file(file_info) as (f, fname, finish):
+            if url.startswith("data:"):
+                if not allow_data_urls:
+                    raise SynapseError(
+                        500, "Previewing of data: URLs is forbidden", Codes.UNKNOWN
+                    )
+
+                download_result = await self._parse_data_url(url, f)
+            else:
+                download_result = await self._download_url(url, f)
+
+            await finish()
+
+        try:
+            time_now_ms = self.clock.time_msec()
+
+            await self.store.store_local_media(
+                media_id=file_id,
+                media_type=download_result.media_type,
+                time_now_ms=time_now_ms,
+                upload_name=download_result.download_name,
+                media_length=download_result.length,
+                user_id=user,
+                url_cache=url,
+            )
+
+        except Exception as e:
+            logger.error("Error handling downloaded %s: %r", url, e)
+            # TODO: we really ought to delete the downloaded file in this
+            # case, since we won't have recorded it in the db, and will
+            # therefore not expire it.
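+            # (A possible fix, not implemented here: remove the stored file, e.g.
+            # `os.remove(fname)`, before re-raising, since `fname` is still in scope.)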
+ raise + + return MediaInfo( + media_type=download_result.media_type, + media_length=download_result.length, + download_name=download_result.download_name, + created_ts_ms=time_now_ms, + filesystem_id=file_id, + filename=fname, + uri=download_result.uri, + response_code=download_result.response_code, + expires=download_result.expires, + etag=download_result.etag, + ) + + async def _precache_image_url( + self, user: UserID, media_info: MediaInfo, og: JsonDict + ) -> None: + """ + Pre-cache the image (if one exists) for posterity + + Args: + user: The user requesting the preview. + media_info: The media being previewed. + og: The Open Graph dictionary. This is modified with image information. + """ + # If there's no image or it is blank, there's nothing to do. + if "og:image" not in og: + return + + # Remove the raw image URL, this will be replaced with an MXC URL, if successful. + image_url = og.pop("og:image") + if not image_url: + return + + # The image URL from the HTML might be relative to the previewed page, + # convert it to an URL which can be requested directly. + url_parts = urlparse(image_url) + if url_parts.scheme != "data": + image_url = urljoin(media_info.uri, image_url) + + # FIXME: it might be cleaner to use the same flow as the main /preview_url + # request itself and benefit from the same caching etc. But for now we + # just rely on the caching on the master request to speed things up. + try: + image_info = await self._handle_url(image_url, user, allow_data_urls=True) + except Exception as e: + # Pre-caching the image failed, don't block the entire URL preview. + logger.warning( + "Pre-caching image failed during URL preview: %s errored with %s", + image_url, + e, + ) + return + + if _is_media(image_info.media_type): + # TODO: make sure we don't choke on white-on-transparent images + file_id = image_info.filesystem_id + dims = await self.media_repo._generate_thumbnails( + None, file_id, file_id, image_info.media_type, url_cache=True + ) + if dims: + og["og:image:width"] = dims["width"] + og["og:image:height"] = dims["height"] + else: + logger.warning("Couldn't get dims for %s", image_url) + + og["og:image"] = f"mxc://{self.server_name}/{image_info.filesystem_id}" + og["og:image:type"] = image_info.media_type + og["matrix:image:size"] = image_info.media_length + + async def _handle_oembed_response( + self, url: str, media_info: MediaInfo, expiration_ms: int + ) -> Tuple[JsonDict, Optional[str], int]: + """ + Parse the downloaded oEmbed info. + + Args: + url: The URL which is being previewed (not the one which was + requested). + media_info: The media being previewed. + expiration_ms: The length of time, in milliseconds, the media is valid for. + + Returns: + A tuple of: + The Open Graph dictionary, if the oEmbed info can be parsed. + The author name if it could be retrieved from oEmbed. + The (possibly updated) length of time, in milliseconds, the media is valid for. + """ + # If JSON was not returned, there's nothing to do. + if not _is_json(media_info.media_type): + return {}, None, expiration_ms + + with open(media_info.filename, "rb") as file: + body = file.read() + + oembed_response = self._oembed.parse_oembed_response(url, body) + open_graph_result = oembed_response.open_graph_result + + # Use the cache age from the oEmbed result, if one was given. 
+ if open_graph_result and oembed_response.cache_age is not None: + expiration_ms = oembed_response.cache_age + + return open_graph_result, oembed_response.author_name, expiration_ms + + def _start_expire_url_cache_data(self) -> Deferred: + return run_as_background_process( + "expire_url_cache_data", self._expire_url_cache_data + ) + + async def _expire_url_cache_data(self) -> None: + """Clean up expired url cache content, media and thumbnails.""" + + assert self._worker_run_media_background_jobs + + now = self.clock.time_msec() + + logger.debug("Running url preview cache expiry") + + def try_remove_parent_dirs(dirs: Iterable[str]) -> None: + """Attempt to remove the given chain of parent directories + + Args: + dirs: The list of directory paths to delete, with children appearing + before their parents. + """ + for dir in dirs: + try: + os.rmdir(dir) + except FileNotFoundError: + # Already deleted, continue with deleting the rest + pass + except OSError as e: + # Failed, skip deleting the rest of the parent dirs + if e.errno != errno.ENOTEMPTY: + logger.warning( + "Failed to remove media directory while clearing url preview cache: %r: %s", + dir, + e, + ) + break + + # First we delete expired url cache entries + media_ids = await self.store.get_expired_url_cache(now) + + removed_media = [] + for media_id in media_ids: + fname = self.filepaths.url_cache_filepath(media_id) + try: + os.remove(fname) + except FileNotFoundError: + pass # If the path doesn't exist, meh + except OSError as e: + logger.warning( + "Failed to remove media while clearing url preview cache: %r: %s", + media_id, + e, + ) + continue + + removed_media.append(media_id) + + dirs = self.filepaths.url_cache_filepath_dirs_to_delete(media_id) + try_remove_parent_dirs(dirs) + + await self.store.delete_url_cache(removed_media) + + if removed_media: + logger.debug( + "Deleted %d entries from url preview cache", len(removed_media) + ) + else: + logger.debug("No entries removed from url preview cache") + + # Now we delete old images associated with the url cache. + # These may be cached for a bit on the client (i.e., they + # may have a room open with a preview url thing open). + # So we wait a couple of days before deleting, just in case. + expire_before = now - IMAGE_CACHE_EXPIRY_MS + media_ids = await self.store.get_url_cache_media_before(expire_before) + + removed_media = [] + for media_id in media_ids: + fname = self.filepaths.url_cache_filepath(media_id) + try: + os.remove(fname) + except FileNotFoundError: + pass # If the path doesn't exist, meh + except OSError as e: + logger.warning( + "Failed to remove media from url preview cache: %r: %s", media_id, e + ) + continue + + dirs = self.filepaths.url_cache_filepath_dirs_to_delete(media_id) + try_remove_parent_dirs(dirs) + + thumbnail_dir = self.filepaths.url_cache_thumbnail_directory(media_id) + try: + shutil.rmtree(thumbnail_dir) + except FileNotFoundError: + pass # If the path doesn't exist, meh + except OSError as e: + logger.warning( + "Failed to remove media from url preview cache: %r: %s", media_id, e + ) + continue + + removed_media.append(media_id) + + dirs = self.filepaths.url_cache_thumbnail_dirs_to_delete(media_id) + # Note that one of the directories to be deleted has already been + # removed by the `rmtree` above. 
+ try_remove_parent_dirs(dirs) + + await self.store.delete_url_cache_media(removed_media) + + if removed_media: + logger.debug("Deleted %d media from url preview cache", len(removed_media)) + else: + logger.debug("No media removed from url preview cache") + + +def _is_media(content_type: str) -> bool: + return content_type.lower().startswith("image/") + + +def _is_html(content_type: str) -> bool: + content_type = content_type.lower() + return content_type.startswith("text/html") or content_type.startswith( + "application/xhtml" + ) + + +def _is_json(content_type: str) -> bool: + return content_type.lower().startswith("application/json") + + +def _is_previewable(content_type: str) -> bool: + """Returns True for content types for which we will perform URL preview and False + otherwise.""" + + return _is_html(content_type) or _is_media(content_type) or _is_json(content_type) diff --git a/synapse/rest/media/preview_url_resource.py b/synapse/rest/media/preview_url_resource.py index 7ada72875..58513c4be 100644 --- a/synapse/rest/media/preview_url_resource.py +++ b/synapse/rest/media/preview_url_resource.py @@ -12,26 +12,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import datetime -import errno -import fnmatch -import logging -import os -import re -import shutil -import sys -import traceback -from typing import TYPE_CHECKING, BinaryIO, Iterable, Optional, Tuple -from urllib.parse import urljoin, urlparse, urlsplit -from urllib.request import urlopen -import attr +from typing import TYPE_CHECKING -from twisted.internet.defer import Deferred -from twisted.internet.error import DNSLookupError - -from synapse.api.errors import Codes, SynapseError -from synapse.http.client import SimpleHttpClient from synapse.http.server import ( DirectServeJsonResource, respond_with_json, @@ -39,71 +22,13 @@ from synapse.http.server import ( ) from synapse.http.servlet import parse_integer, parse_string from synapse.http.site import SynapseRequest -from synapse.logging.context import make_deferred_yieldable, run_in_background -from synapse.media._base import FileInfo, get_filename_from_headers from synapse.media.media_storage import MediaStorage -from synapse.media.oembed import OEmbedProvider -from synapse.media.preview_html import decode_body, parse_html_to_open_graph -from synapse.metrics.background_process_metrics import run_as_background_process -from synapse.types import JsonDict, UserID -from synapse.util import json_encoder -from synapse.util.async_helpers import ObservableDeferred -from synapse.util.caches.expiringcache import ExpiringCache -from synapse.util.stringutils import random_string +from synapse.media.url_previewer import UrlPreviewer if TYPE_CHECKING: from synapse.media.media_repository import MediaRepository from synapse.server import HomeServer -logger = logging.getLogger(__name__) - -OG_TAG_NAME_MAXLEN = 50 -OG_TAG_VALUE_MAXLEN = 1000 - -ONE_HOUR = 60 * 60 * 1000 -ONE_DAY = 24 * ONE_HOUR -IMAGE_CACHE_EXPIRY_MS = 2 * ONE_DAY - - -@attr.s(slots=True, frozen=True, auto_attribs=True) -class DownloadResult: - length: int - uri: str - response_code: int - media_type: str - download_name: Optional[str] - expires: int - etag: Optional[str] - - -@attr.s(slots=True, frozen=True, auto_attribs=True) -class MediaInfo: - """ - Information parsed from downloading media being previewed. - """ - - # The Content-Type header of the response. 
- media_type: str - # The length (in bytes) of the downloaded media. - media_length: int - # The media filename, according to the server. This is parsed from the - # returned headers, if possible. - download_name: Optional[str] - # The time of the preview. - created_ts_ms: int - # Information from the media storage provider about where the file is stored - # on disk. - filesystem_id: str - filename: str - # The URI being previewed. - uri: str - # The HTTP response code. - response_code: int - # The timestamp (in milliseconds) of when this preview expires. - expires: int - # The ETag header of the response. - etag: Optional[str] - class PreviewUrlResource(DirectServeJsonResource): """ @@ -121,54 +46,6 @@ class PreviewUrlResource(DirectServeJsonResource): * The URL metadata must be stored somewhere, rather than just using Matrix itself to store the media. * Matrix cannot be used to distribute the metadata between homeservers. - - When Synapse is asked to preview a URL it does the following: - - 1. Checks against a URL blacklist (defined as `url_preview_url_blacklist` in the - config). - 2. Checks the URL against an in-memory cache and returns the result if it exists. (This - is also used to de-duplicate processing of multiple in-flight requests at once.) - 3. Kicks off a background process to generate a preview: - 1. Checks URL and timestamp against the database cache and returns the result if it - has not expired and was successful (a 2xx return code). - 2. Checks if the URL matches an oEmbed (https://oembed.com/) pattern. If it - does, update the URL to download. - 3. Downloads the URL and stores it into a file via the media storage provider - and saves the local media metadata. - 4. If the media is an image: - 1. Generates thumbnails. - 2. Generates an Open Graph response based on image properties. - 5. If the media is HTML: - 1. Decodes the HTML via the stored file. - 2. Generates an Open Graph response from the HTML. - 3. If a JSON oEmbed URL was found in the HTML via autodiscovery: - 1. Downloads the URL and stores it into a file via the media storage provider - and saves the local media metadata. - 2. Convert the oEmbed response to an Open Graph response. - 3. Override any Open Graph data from the HTML with data from oEmbed. - 4. If an image exists in the Open Graph response: - 1. Downloads the URL and stores it into a file via the media storage - provider and saves the local media metadata. - 2. Generates thumbnails. - 3. Updates the Open Graph response based on image properties. - 6. If the media is JSON and an oEmbed URL was found: - 1. Convert the oEmbed response to an Open Graph response. - 2. If a thumbnail or image is in the oEmbed response: - 1. Downloads the URL and stores it into a file via the media storage - provider and saves the local media metadata. - 2. Generates thumbnails. - 3. Updates the Open Graph response based on image properties. - 7. Stores the result in the database cache. - 4. Returns the result. - - If any additional requests (e.g. from oEmbed autodiscovery, step 5.3 or - image thumbnailing, step 5.4 or 6.4) fails then the URL preview as a whole - does not fail. As much information as possible is returned. - - The in-memory cache expires after 1 hour. - - Expired entries in the database cache (and their associated media files) are - deleted every 10 seconds. The default expiration time is 1 hour from download. 
""" isLeaf = True @@ -183,48 +60,10 @@ class PreviewUrlResource(DirectServeJsonResource): self.auth = hs.get_auth() self.clock = hs.get_clock() - self.filepaths = media_repo.filepaths - self.max_spider_size = hs.config.media.max_spider_size - self.server_name = hs.hostname - self.store = hs.get_datastores().main - self.client = SimpleHttpClient( - hs, - treq_args={"browser_like_redirects": True}, - ip_whitelist=hs.config.media.url_preview_ip_range_whitelist, - ip_blacklist=hs.config.media.url_preview_ip_range_blacklist, - use_proxy=True, - ) self.media_repo = media_repo - self.primary_base_path = media_repo.primary_base_path self.media_storage = media_storage - self._oembed = OEmbedProvider(hs) - - # We run the background jobs if we're the instance specified (or no - # instance is specified, where we assume there is only one instance - # serving media). - instance_running_jobs = hs.config.media.media_instance_running_background_jobs - self._worker_run_media_background_jobs = ( - instance_running_jobs is None - or instance_running_jobs == hs.get_instance_name() - ) - - self.url_preview_url_blacklist = hs.config.media.url_preview_url_blacklist - self.url_preview_accept_language = hs.config.media.url_preview_accept_language - - # memory cache mapping urls to an ObservableDeferred returning - # JSON-encoded OG metadata - self._cache: ExpiringCache[str, ObservableDeferred] = ExpiringCache( - cache_name="url_previews", - clock=self.clock, - # don't spider URLs more often than once an hour - expiry_ms=ONE_HOUR, - ) - - if self._worker_run_media_background_jobs: - self._cleaner_loop = self.clock.looping_call( - self._start_expire_url_cache_data, 10 * 1000 - ) + self._url_previewer = UrlPreviewer(hs, media_repo, media_storage) async def _async_render_OPTIONS(self, request: SynapseRequest) -> None: request.setHeader(b"Allow", b"OPTIONS, GET") @@ -238,632 +77,5 @@ class PreviewUrlResource(DirectServeJsonResource): if ts is None: ts = self.clock.time_msec() - # XXX: we could move this into _do_preview if we wanted. - url_tuple = urlsplit(url) - for entry in self.url_preview_url_blacklist: - match = True - for attrib in entry: - pattern = entry[attrib] - value = getattr(url_tuple, attrib) - logger.debug( - "Matching attrib '%s' with value '%s' against pattern '%s'", - attrib, - value, - pattern, - ) - - if value is None: - match = False - continue - - # Some attributes might not be parsed as strings by urlsplit (such as the - # port, which is parsed as an int). Because we use match functions that - # expect strings, we want to make sure that's what we give them. 
- value_str = str(value) - - if pattern.startswith("^"): - if not re.match(pattern, value_str): - match = False - continue - else: - if not fnmatch.fnmatch(value_str, pattern): - match = False - continue - if match: - logger.warning("URL %s blocked by url_blacklist entry %s", url, entry) - raise SynapseError( - 403, "URL blocked by url pattern blacklist entry", Codes.UNKNOWN - ) - - # the in-memory cache: - # * ensures that only one request is active at a time - # * takes load off the DB for the thundering herds - # * also caches any failures (unlike the DB) so we don't keep - # requesting the same endpoint - - observable = self._cache.get(url) - - if not observable: - download = run_in_background(self._do_preview, url, requester.user, ts) - observable = ObservableDeferred(download, consumeErrors=True) - self._cache[url] = observable - else: - logger.info("Returning cached response") - - og = await make_deferred_yieldable(observable.observe()) + og = await self._url_previewer.preview(url, requester.user, ts) respond_with_json_bytes(request, 200, og, send_cors=True) - - async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes: - """Check the db, and download the URL and build a preview - - Args: - url: The URL to preview. - user: The user requesting the preview. - ts: The timestamp requested for the preview. - - Returns: - json-encoded og data - """ - # check the URL cache in the DB (which will also provide us with - # historical previews, if we have any) - cache_result = await self.store.get_url_cache(url, ts) - if ( - cache_result - and cache_result["expires_ts"] > ts - and cache_result["response_code"] / 100 == 2 - ): - # It may be stored as text in the database, not as bytes (such as - # PostgreSQL). If so, encode it back before handing it on. - og = cache_result["og"] - if isinstance(og, str): - og = og.encode("utf8") - return og - - # If this URL can be accessed via oEmbed, use that instead. - url_to_download = url - oembed_url = self._oembed.get_oembed_url(url) - if oembed_url: - url_to_download = oembed_url - - media_info = await self._handle_url(url_to_download, user) - - logger.debug("got media_info of '%s'", media_info) - - # The number of milliseconds that the response should be considered valid. - expiration_ms = media_info.expires - author_name: Optional[str] = None - - if _is_media(media_info.media_type): - file_id = media_info.filesystem_id - dims = await self.media_repo._generate_thumbnails( - None, file_id, file_id, media_info.media_type, url_cache=True - ) - - og = { - "og:description": media_info.download_name, - "og:image": f"mxc://{self.server_name}/{media_info.filesystem_id}", - "og:image:type": media_info.media_type, - "matrix:image:size": media_info.media_length, - } - - if dims: - og["og:image:width"] = dims["width"] - og["og:image:height"] = dims["height"] - else: - logger.warning("Couldn't get dims for %s" % url) - - # define our OG response for this media - elif _is_html(media_info.media_type): - # TODO: somehow stop a big HTML tree from exploding synapse's RAM - - with open(media_info.filename, "rb") as file: - body = file.read() - - tree = decode_body(body, media_info.uri, media_info.media_type) - if tree is not None: - # Check if this HTML document points to oEmbed information and - # defer to that. 
- oembed_url = self._oembed.autodiscover_from_html(tree) - og_from_oembed: JsonDict = {} - if oembed_url: - try: - oembed_info = await self._handle_url( - oembed_url, user, allow_data_urls=True - ) - except Exception as e: - # Fetching the oEmbed info failed, don't block the entire URL preview. - logger.warning( - "oEmbed fetch failed during URL preview: %s errored with %s", - oembed_url, - e, - ) - else: - ( - og_from_oembed, - author_name, - expiration_ms, - ) = await self._handle_oembed_response( - url, oembed_info, expiration_ms - ) - - # Parse Open Graph information from the HTML in case the oEmbed - # response failed or is incomplete. - og_from_html = parse_html_to_open_graph(tree) - - # Compile the Open Graph response by using the scraped - # information from the HTML and overlaying any information - # from the oEmbed response. - og = {**og_from_html, **og_from_oembed} - - await self._precache_image_url(user, media_info, og) - else: - og = {} - - elif oembed_url: - # Handle the oEmbed information. - og, author_name, expiration_ms = await self._handle_oembed_response( - url, media_info, expiration_ms - ) - await self._precache_image_url(user, media_info, og) - - else: - logger.warning("Failed to find any OG data in %s", url) - og = {} - - # If we don't have a title but we have author_name, copy it as - # title - if not og.get("og:title") and author_name: - og["og:title"] = author_name - - # filter out any stupidly long values - keys_to_remove = [] - for k, v in og.items(): - # values can be numeric as well as strings, hence the cast to str - if len(k) > OG_TAG_NAME_MAXLEN or len(str(v)) > OG_TAG_VALUE_MAXLEN: - logger.warning( - "Pruning overlong tag %s from OG data", k[:OG_TAG_NAME_MAXLEN] - ) - keys_to_remove.append(k) - for k in keys_to_remove: - del og[k] - - logger.debug("Calculated OG for %s as %s", url, og) - - jsonog = json_encoder.encode(og) - - # Cap the amount of time to consider a response valid. - expiration_ms = min(expiration_ms, ONE_DAY) - - # store OG in history-aware DB cache - await self.store.store_url_cache( - url, - media_info.response_code, - media_info.etag, - media_info.created_ts_ms + expiration_ms, - jsonog, - media_info.filesystem_id, - media_info.created_ts_ms, - ) - - return jsonog.encode("utf8") - - async def _download_url(self, url: str, output_stream: BinaryIO) -> DownloadResult: - """ - Fetches a remote URL and parses the headers. - - Args: - url: The URL to fetch. - output_stream: The stream to write the content to. - - Returns: - A tuple of: - Media length, URL downloaded, the HTTP response code, - the media type, the downloaded file name, the number of - milliseconds the result is valid for, the etag header. - """ - - try: - logger.debug("Trying to get preview for url '%s'", url) - length, headers, uri, code = await self.client.get_file( - url, - output_stream=output_stream, - max_size=self.max_spider_size, - headers={ - b"Accept-Language": self.url_preview_accept_language, - # Use a custom user agent for the preview because some sites will only return - # Open Graph metadata to crawler user agents. Omit the Synapse version - # string to avoid leaking information. - b"User-Agent": [ - "Synapse (bot; +https://github.com/matrix-org/synapse)" - ], - }, - is_allowed_content_type=_is_previewable, - ) - except SynapseError: - # Pass SynapseErrors through directly, so that the servlet - # handler will return a SynapseError to the client instead of - # blank data or a 500. 
- raise - except DNSLookupError: - # DNS lookup returned no results - # Note: This will also be the case if one of the resolved IP - # addresses is blacklisted - raise SynapseError( - 502, - "DNS resolution failure during URL preview generation", - Codes.UNKNOWN, - ) - except Exception as e: - # FIXME: pass through 404s and other error messages nicely - logger.warning("Error downloading %s: %r", url, e) - - raise SynapseError( - 500, - "Failed to download content: %s" - % (traceback.format_exception_only(sys.exc_info()[0], e),), - Codes.UNKNOWN, - ) - - if b"Content-Type" in headers: - media_type = headers[b"Content-Type"][0].decode("ascii") - else: - media_type = "application/octet-stream" - - download_name = get_filename_from_headers(headers) - - # FIXME: we should calculate a proper expiration based on the - # Cache-Control and Expire headers. But for now, assume 1 hour. - expires = ONE_HOUR - etag = headers[b"ETag"][0].decode("ascii") if b"ETag" in headers else None - - return DownloadResult( - length, uri, code, media_type, download_name, expires, etag - ) - - async def _parse_data_url( - self, url: str, output_stream: BinaryIO - ) -> DownloadResult: - """ - Parses a data: URL. - - Args: - url: The URL to parse. - output_stream: The stream to write the content to. - - Returns: - A tuple of: - Media length, URL downloaded, the HTTP response code, - the media type, the downloaded file name, the number of - milliseconds the result is valid for, the etag header. - """ - - try: - logger.debug("Trying to parse data url '%s'", url) - with urlopen(url) as url_info: - # TODO Can this be more efficient. - output_stream.write(url_info.read()) - except Exception as e: - logger.warning("Error parsing data: URL %s: %r", url, e) - - raise SynapseError( - 500, - "Failed to parse data URL: %s" - % (traceback.format_exception_only(sys.exc_info()[0], e),), - Codes.UNKNOWN, - ) - - return DownloadResult( - # Read back the length that has been written. - length=output_stream.tell(), - uri=url, - # If it was parsed, consider this a 200 OK. - response_code=200, - # urlopen shoves the media-type from the data URL into the content type - # header object. - media_type=url_info.headers.get_content_type(), - # Some features are not supported by data: URLs. - download_name=None, - expires=ONE_HOUR, - etag=None, - ) - - async def _handle_url( - self, url: str, user: UserID, allow_data_urls: bool = False - ) -> MediaInfo: - """ - Fetches content from a URL and parses the result to generate a MediaInfo. - - It uses the media storage provider to persist the fetched content and - stores the mapping into the database. - - Args: - url: The URL to fetch. - user: The user who ahs requested this URL. - allow_data_urls: True if data URLs should be allowed. - - Returns: - A MediaInfo object describing the fetched content. - """ - - # TODO: we should probably honour robots.txt... except in practice - # we're most likely being explicitly triggered by a human rather than a - # bot, so are we really a robot? 
- - file_id = datetime.date.today().isoformat() + "_" + random_string(16) - - file_info = FileInfo(server_name=None, file_id=file_id, url_cache=True) - - with self.media_storage.store_into_file(file_info) as (f, fname, finish): - if url.startswith("data:"): - if not allow_data_urls: - raise SynapseError( - 500, "Previewing of data: URLs is forbidden", Codes.UNKNOWN - ) - - download_result = await self._parse_data_url(url, f) - else: - download_result = await self._download_url(url, f) - - await finish() - - try: - time_now_ms = self.clock.time_msec() - - await self.store.store_local_media( - media_id=file_id, - media_type=download_result.media_type, - time_now_ms=time_now_ms, - upload_name=download_result.download_name, - media_length=download_result.length, - user_id=user, - url_cache=url, - ) - - except Exception as e: - logger.error("Error handling downloaded %s: %r", url, e) - # TODO: we really ought to delete the downloaded file in this - # case, since we won't have recorded it in the db, and will - # therefore not expire it. - raise - - return MediaInfo( - media_type=download_result.media_type, - media_length=download_result.length, - download_name=download_result.download_name, - created_ts_ms=time_now_ms, - filesystem_id=file_id, - filename=fname, - uri=download_result.uri, - response_code=download_result.response_code, - expires=download_result.expires, - etag=download_result.etag, - ) - - async def _precache_image_url( - self, user: UserID, media_info: MediaInfo, og: JsonDict - ) -> None: - """ - Pre-cache the image (if one exists) for posterity - - Args: - user: The user requesting the preview. - media_info: The media being previewed. - og: The Open Graph dictionary. This is modified with image information. - """ - # If there's no image or it is blank, there's nothing to do. - if "og:image" not in og: - return - - # Remove the raw image URL, this will be replaced with an MXC URL, if successful. - image_url = og.pop("og:image") - if not image_url: - return - - # The image URL from the HTML might be relative to the previewed page, - # convert it to an URL which can be requested directly. - url_parts = urlparse(image_url) - if url_parts.scheme != "data": - image_url = urljoin(media_info.uri, image_url) - - # FIXME: it might be cleaner to use the same flow as the main /preview_url - # request itself and benefit from the same caching etc. But for now we - # just rely on the caching on the master request to speed things up. - try: - image_info = await self._handle_url(image_url, user, allow_data_urls=True) - except Exception as e: - # Pre-caching the image failed, don't block the entire URL preview. - logger.warning( - "Pre-caching image failed during URL preview: %s errored with %s", - image_url, - e, - ) - return - - if _is_media(image_info.media_type): - # TODO: make sure we don't choke on white-on-transparent images - file_id = image_info.filesystem_id - dims = await self.media_repo._generate_thumbnails( - None, file_id, file_id, image_info.media_type, url_cache=True - ) - if dims: - og["og:image:width"] = dims["width"] - og["og:image:height"] = dims["height"] - else: - logger.warning("Couldn't get dims for %s", image_url) - - og["og:image"] = f"mxc://{self.server_name}/{image_info.filesystem_id}" - og["og:image:type"] = image_info.media_type - og["matrix:image:size"] = image_info.media_length - - async def _handle_oembed_response( - self, url: str, media_info: MediaInfo, expiration_ms: int - ) -> Tuple[JsonDict, Optional[str], int]: - """ - Parse the downloaded oEmbed info. 
- - Args: - url: The URL which is being previewed (not the one which was - requested). - media_info: The media being previewed. - expiration_ms: The length of time, in milliseconds, the media is valid for. - - Returns: - A tuple of: - The Open Graph dictionary, if the oEmbed info can be parsed. - The author name if it could be retrieved from oEmbed. - The (possibly updated) length of time, in milliseconds, the media is valid for. - """ - # If JSON was not returned, there's nothing to do. - if not _is_json(media_info.media_type): - return {}, None, expiration_ms - - with open(media_info.filename, "rb") as file: - body = file.read() - - oembed_response = self._oembed.parse_oembed_response(url, body) - open_graph_result = oembed_response.open_graph_result - - # Use the cache age from the oEmbed result, if one was given. - if open_graph_result and oembed_response.cache_age is not None: - expiration_ms = oembed_response.cache_age - - return open_graph_result, oembed_response.author_name, expiration_ms - - def _start_expire_url_cache_data(self) -> Deferred: - return run_as_background_process( - "expire_url_cache_data", self._expire_url_cache_data - ) - - async def _expire_url_cache_data(self) -> None: - """Clean up expired url cache content, media and thumbnails.""" - - assert self._worker_run_media_background_jobs - - now = self.clock.time_msec() - - logger.debug("Running url preview cache expiry") - - def try_remove_parent_dirs(dirs: Iterable[str]) -> None: - """Attempt to remove the given chain of parent directories - - Args: - dirs: The list of directory paths to delete, with children appearing - before their parents. - """ - for dir in dirs: - try: - os.rmdir(dir) - except FileNotFoundError: - # Already deleted, continue with deleting the rest - pass - except OSError as e: - # Failed, skip deleting the rest of the parent dirs - if e.errno != errno.ENOTEMPTY: - logger.warning( - "Failed to remove media directory while clearing url preview cache: %r: %s", - dir, - e, - ) - break - - # First we delete expired url cache entries - media_ids = await self.store.get_expired_url_cache(now) - - removed_media = [] - for media_id in media_ids: - fname = self.filepaths.url_cache_filepath(media_id) - try: - os.remove(fname) - except FileNotFoundError: - pass # If the path doesn't exist, meh - except OSError as e: - logger.warning( - "Failed to remove media while clearing url preview cache: %r: %s", - media_id, - e, - ) - continue - - removed_media.append(media_id) - - dirs = self.filepaths.url_cache_filepath_dirs_to_delete(media_id) - try_remove_parent_dirs(dirs) - - await self.store.delete_url_cache(removed_media) - - if removed_media: - logger.debug( - "Deleted %d entries from url preview cache", len(removed_media) - ) - else: - logger.debug("No entries removed from url preview cache") - - # Now we delete old images associated with the url cache. - # These may be cached for a bit on the client (i.e., they - # may have a room open with a preview url thing open). - # So we wait a couple of days before deleting, just in case. 
- expire_before = now - IMAGE_CACHE_EXPIRY_MS - media_ids = await self.store.get_url_cache_media_before(expire_before) - - removed_media = [] - for media_id in media_ids: - fname = self.filepaths.url_cache_filepath(media_id) - try: - os.remove(fname) - except FileNotFoundError: - pass # If the path doesn't exist, meh - except OSError as e: - logger.warning( - "Failed to remove media from url preview cache: %r: %s", media_id, e - ) - continue - - dirs = self.filepaths.url_cache_filepath_dirs_to_delete(media_id) - try_remove_parent_dirs(dirs) - - thumbnail_dir = self.filepaths.url_cache_thumbnail_directory(media_id) - try: - shutil.rmtree(thumbnail_dir) - except FileNotFoundError: - pass # If the path doesn't exist, meh - except OSError as e: - logger.warning( - "Failed to remove media from url preview cache: %r: %s", media_id, e - ) - continue - - removed_media.append(media_id) - - dirs = self.filepaths.url_cache_thumbnail_dirs_to_delete(media_id) - # Note that one of the directories to be deleted has already been - # removed by the `rmtree` above. - try_remove_parent_dirs(dirs) - - await self.store.delete_url_cache_media(removed_media) - - if removed_media: - logger.debug("Deleted %d media from url preview cache", len(removed_media)) - else: - logger.debug("No media removed from url preview cache") - - -def _is_media(content_type: str) -> bool: - return content_type.lower().startswith("image/") - - -def _is_html(content_type: str) -> bool: - content_type = content_type.lower() - return content_type.startswith("text/html") or content_type.startswith( - "application/xhtml" - ) - - -def _is_json(content_type: str) -> bool: - return content_type.lower().startswith("application/json") - - -def _is_previewable(content_type: str) -> bool: - """Returns True for content types for which we will perform URL preview and False - otherwise.""" - - return _is_html(content_type) or _is_media(content_type) or _is_json(content_type) diff --git a/tests/rest/media/test_url_preview.py b/tests/rest/media/test_url_preview.py index e91dc581c..e44beae8c 100644 --- a/tests/rest/media/test_url_preview.py +++ b/tests/rest/media/test_url_preview.py @@ -26,8 +26,8 @@ from twisted.internet.interfaces import IAddress, IResolutionReceiver from twisted.test.proto_helpers import AccumulatingProtocol, MemoryReactor from synapse.config.oembed import OEmbedEndpointConfig +from synapse.media.url_previewer import IMAGE_CACHE_EXPIRY_MS from synapse.rest.media.media_repository_resource import MediaRepositoryResource -from synapse.rest.media.preview_url_resource import IMAGE_CACHE_EXPIRY_MS from synapse.server import HomeServer from synapse.types import JsonDict from synapse.util import Clock @@ -36,7 +36,6 @@ from synapse.util.stringutils import parse_and_validate_mxc_uri from tests import unittest from tests.server import FakeTransport from tests.test_utils import SMALL_PNG -from tests.utils import MockClock try: import lxml @@ -117,8 +116,9 @@ class URLPreviewTests(unittest.HomeserverTestCase): return hs def prepare(self, reactor: MemoryReactor, clock: Clock, hs: HomeServer) -> None: - self.media_repo = hs.get_media_repository_resource() - self.preview_url = self.media_repo.children[b"preview_url"] + self.media_repo = hs.get_media_repository() + media_repo_resource = hs.get_media_repository_resource() + self.preview_url = media_repo_resource.children[b"preview_url"] self.lookups: Dict[str, Any] = {} @@ -193,9 +193,9 @@ class URLPreviewTests(unittest.HomeserverTestCase): ) # Clear the in-memory cache - 
self.assertIn("http://matrix.org", self.preview_url._cache) - self.preview_url._cache.pop("http://matrix.org") - self.assertNotIn("http://matrix.org", self.preview_url._cache) + self.assertIn("http://matrix.org", self.preview_url._url_previewer._cache) + self.preview_url._url_previewer._cache.pop("http://matrix.org") + self.assertNotIn("http://matrix.org", self.preview_url._url_previewer._cache) # Check the database cache returns the correct response channel = self.make_request( @@ -1073,7 +1073,7 @@ class URLPreviewTests(unittest.HomeserverTestCase): """Test that files are not stored in or fetched from storage providers.""" host, media_id = self._download_image() - rel_file_path = self.preview_url.filepaths.url_cache_filepath_rel(media_id) + rel_file_path = self.media_repo.filepaths.url_cache_filepath_rel(media_id) media_store_path = os.path.join(self.media_store_path, rel_file_path) storage_provider_path = os.path.join(self.storage_path, rel_file_path) @@ -1116,7 +1116,7 @@ class URLPreviewTests(unittest.HomeserverTestCase): host, media_id = self._download_image() rel_thumbnail_path = ( - self.preview_url.filepaths.url_cache_thumbnail_directory_rel(media_id) + self.media_repo.filepaths.url_cache_thumbnail_directory_rel(media_id) ) media_store_thumbnail_path = os.path.join( self.media_store_path, rel_thumbnail_path @@ -1143,7 +1143,7 @@ class URLPreviewTests(unittest.HomeserverTestCase): self.assertEqual(channel.code, 200) # Remove the original, otherwise thumbnails will regenerate - rel_file_path = self.preview_url.filepaths.url_cache_filepath_rel(media_id) + rel_file_path = self.media_repo.filepaths.url_cache_filepath_rel(media_id) media_store_path = os.path.join(self.media_store_path, rel_file_path) os.remove(media_store_path) @@ -1166,26 +1166,24 @@ class URLPreviewTests(unittest.HomeserverTestCase): def test_cache_expiry(self) -> None: """Test that URL cache files and thumbnails are cleaned up properly on expiry.""" - self.preview_url.clock = MockClock() - _host, media_id = self._download_image() - file_path = self.preview_url.filepaths.url_cache_filepath(media_id) - file_dirs = self.preview_url.filepaths.url_cache_filepath_dirs_to_delete( + file_path = self.media_repo.filepaths.url_cache_filepath(media_id) + file_dirs = self.media_repo.filepaths.url_cache_filepath_dirs_to_delete( media_id ) - thumbnail_dir = self.preview_url.filepaths.url_cache_thumbnail_directory( + thumbnail_dir = self.media_repo.filepaths.url_cache_thumbnail_directory( media_id ) - thumbnail_dirs = self.preview_url.filepaths.url_cache_thumbnail_dirs_to_delete( + thumbnail_dirs = self.media_repo.filepaths.url_cache_thumbnail_dirs_to_delete( media_id ) self.assertTrue(os.path.isfile(file_path)) self.assertTrue(os.path.isdir(thumbnail_dir)) - self.preview_url.clock.advance_time_msec(IMAGE_CACHE_EXPIRY_MS + 1) - self.get_success(self.preview_url._expire_url_cache_data()) + self.reactor.advance(IMAGE_CACHE_EXPIRY_MS * 1000 + 1) + self.get_success(self.preview_url._url_previewer._expire_url_cache_data()) for path in [file_path] + file_dirs + [thumbnail_dir] + thumbnail_dirs: self.assertFalse(