2023-03-20 14:32:26 -04:00
|
|
|
#
|
2023-11-21 15:29:58 -05:00
|
|
|
# This file is licensed under the Affero General Public License (AGPL) version 3.
|
|
|
|
#
|
2024-01-23 06:26:48 -05:00
|
|
|
# Copyright 2020-2023 The Matrix.org Foundation C.I.C.
|
|
|
|
# Copyright 2016 OpenMarket Ltd
|
2023-11-21 15:29:58 -05:00
|
|
|
# Copyright (C) 2023 New Vector, Ltd
|
|
|
|
#
|
|
|
|
# This program is free software: you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU Affero General Public License as
|
|
|
|
# published by the Free Software Foundation, either version 3 of the
|
|
|
|
# License, or (at your option) any later version.
|
|
|
|
#
|
|
|
|
# See the GNU Affero General Public License for more details:
|
|
|
|
# <https://www.gnu.org/licenses/agpl-3.0.html>.
|
|
|
|
#
|
|
|
|
# Originally licensed under the Apache License, Version 2.0:
|
|
|
|
# <http://www.apache.org/licenses/LICENSE-2.0>.
|
|
|
|
#
|
|
|
|
# [This file includes modifications made by New Vector Limited]
|
2023-03-20 14:32:26 -04:00
|
|
|
#
|
|
|
|
#
|
|
|
|
import datetime
|
|
|
|
import errno
|
|
|
|
import fnmatch
|
|
|
|
import logging
|
|
|
|
import os
|
|
|
|
import re
|
|
|
|
import shutil
|
|
|
|
import sys
|
|
|
|
import traceback
|
|
|
|
from typing import TYPE_CHECKING, BinaryIO, Iterable, Optional, Tuple
|
|
|
|
from urllib.parse import urljoin, urlparse, urlsplit
|
|
|
|
from urllib.request import urlopen
|
|
|
|
|
|
|
|
import attr
|
|
|
|
|
|
|
|
from twisted.internet.defer import Deferred
|
|
|
|
from twisted.internet.error import DNSLookupError
|
|
|
|
|
|
|
|
from synapse.api.errors import Codes, SynapseError
|
|
|
|
from synapse.http.client import SimpleHttpClient
|
|
|
|
from synapse.logging.context import make_deferred_yieldable, run_in_background
|
|
|
|
from synapse.media._base import FileInfo, get_filename_from_headers
|
|
|
|
from synapse.media.media_storage import MediaStorage
|
|
|
|
from synapse.media.oembed import OEmbedProvider
|
|
|
|
from synapse.media.preview_html import decode_body, parse_html_to_open_graph
|
|
|
|
from synapse.metrics.background_process_metrics import run_as_background_process
|
|
|
|
from synapse.types import JsonDict, UserID
|
|
|
|
from synapse.util import json_encoder
|
|
|
|
from synapse.util.async_helpers import ObservableDeferred
|
|
|
|
from synapse.util.caches.expiringcache import ExpiringCache
|
|
|
|
from synapse.util.stringutils import random_string
|
|
|
|
|
|
|
|
if TYPE_CHECKING:
|
|
|
|
from synapse.media.media_repository import MediaRepository
|
|
|
|
from synapse.server import HomeServer
|
|
|
|
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
OG_TAG_NAME_MAXLEN = 50
|
|
|
|
OG_TAG_VALUE_MAXLEN = 1000
|
|
|
|
|
|
|
|
ONE_HOUR = 60 * 60 * 1000
|
|
|
|
ONE_DAY = 24 * ONE_HOUR
|
|
|
|
IMAGE_CACHE_EXPIRY_MS = 2 * ONE_DAY
|
|
|
|
|
|
|
|
|
|
|
|
@attr.s(slots=True, frozen=True, auto_attribs=True)
|
|
|
|
class DownloadResult:
|
|
|
|
length: int
|
|
|
|
uri: str
|
|
|
|
response_code: int
|
|
|
|
media_type: str
|
|
|
|
download_name: Optional[str]
|
|
|
|
expires: int
|
|
|
|
etag: Optional[str]
|
|
|
|
|
|
|
|
|
|
|
|
@attr.s(slots=True, frozen=True, auto_attribs=True)
|
|
|
|
class MediaInfo:
|
|
|
|
"""
|
|
|
|
Information parsed from downloading media being previewed.
|
|
|
|
"""
|
|
|
|
|
|
|
|
# The Content-Type header of the response.
|
|
|
|
media_type: str
|
|
|
|
# The length (in bytes) of the downloaded media.
|
|
|
|
media_length: int
|
|
|
|
# The media filename, according to the server. This is parsed from the
|
|
|
|
# returned headers, if possible.
|
|
|
|
download_name: Optional[str]
|
|
|
|
# The time of the preview.
|
|
|
|
created_ts_ms: int
|
|
|
|
# Information from the media storage provider about where the file is stored
|
|
|
|
# on disk.
|
|
|
|
filesystem_id: str
|
|
|
|
filename: str
|
|
|
|
# The URI being previewed.
|
|
|
|
uri: str
|
|
|
|
# The HTTP response code.
|
|
|
|
response_code: int
|
|
|
|
# The timestamp (in milliseconds) of when this preview expires.
|
|
|
|
expires: int
|
|
|
|
# The ETag header of the response.
|
|
|
|
etag: Optional[str]
|
|
|
|
|
|
|
|
|
|
|
|
class UrlPreviewer:
|
|
|
|
"""
|
|
|
|
Generates an Open Graph (https://ogp.me/) responses (with some Matrix
|
|
|
|
specific additions) for a given URL.
|
|
|
|
|
|
|
|
When Synapse is asked to preview a URL it does the following:
|
|
|
|
|
2023-05-19 08:25:25 -04:00
|
|
|
1. Checks against a URL blocklist (defined as `url_preview_url_blacklist` in the
|
2023-03-20 14:32:26 -04:00
|
|
|
config).
|
|
|
|
2. Checks the URL against an in-memory cache and returns the result if it exists. (This
|
|
|
|
is also used to de-duplicate processing of multiple in-flight requests at once.)
|
|
|
|
3. Kicks off a background process to generate a preview:
|
|
|
|
1. Checks URL and timestamp against the database cache and returns the result if it
|
|
|
|
has not expired and was successful (a 2xx return code).
|
|
|
|
2. Checks if the URL matches an oEmbed (https://oembed.com/) pattern. If it
|
2023-05-16 16:25:01 -04:00
|
|
|
does and the new URL is not blocked, update the URL to download.
|
2023-03-20 14:32:26 -04:00
|
|
|
3. Downloads the URL and stores it into a file via the media storage provider
|
|
|
|
and saves the local media metadata.
|
|
|
|
4. If the media is an image:
|
|
|
|
1. Generates thumbnails.
|
|
|
|
2. Generates an Open Graph response based on image properties.
|
|
|
|
5. If the media is HTML:
|
|
|
|
1. Decodes the HTML via the stored file.
|
|
|
|
2. Generates an Open Graph response from the HTML.
|
|
|
|
3. If a JSON oEmbed URL was found in the HTML via autodiscovery:
|
|
|
|
1. Downloads the URL and stores it into a file via the media storage provider
|
|
|
|
and saves the local media metadata.
|
|
|
|
2. Convert the oEmbed response to an Open Graph response.
|
|
|
|
3. Override any Open Graph data from the HTML with data from oEmbed.
|
2023-05-16 16:25:01 -04:00
|
|
|
4. If an image URL exists in the Open Graph response:
|
2023-03-20 14:32:26 -04:00
|
|
|
1. Downloads the URL and stores it into a file via the media storage
|
|
|
|
provider and saves the local media metadata.
|
|
|
|
2. Generates thumbnails.
|
|
|
|
3. Updates the Open Graph response based on image properties.
|
2023-05-16 16:25:01 -04:00
|
|
|
6. If an oEmbed URL was found and the media is JSON:
|
2023-03-20 14:32:26 -04:00
|
|
|
1. Convert the oEmbed response to an Open Graph response.
|
2023-05-16 16:25:01 -04:00
|
|
|
2. If an image URL is in the oEmbed response:
|
2023-03-20 14:32:26 -04:00
|
|
|
1. Downloads the URL and stores it into a file via the media storage
|
|
|
|
provider and saves the local media metadata.
|
|
|
|
2. Generates thumbnails.
|
|
|
|
3. Updates the Open Graph response based on image properties.
|
|
|
|
7. Stores the result in the database cache.
|
|
|
|
4. Returns the result.
|
|
|
|
|
|
|
|
If any additional requests (e.g. from oEmbed autodiscovery, step 5.3 or
|
|
|
|
image thumbnailing, step 5.4 or 6.4) fails then the URL preview as a whole
|
2023-05-16 16:25:01 -04:00
|
|
|
does not fail. If any of them are blocked, then those additional requests
|
|
|
|
are skipped. As much information as possible is returned.
|
2023-03-20 14:32:26 -04:00
|
|
|
|
|
|
|
The in-memory cache expires after 1 hour.
|
|
|
|
|
|
|
|
Expired entries in the database cache (and their associated media files) are
|
|
|
|
deleted every 10 seconds. The default expiration time is 1 hour from download.
|
|
|
|
"""
|
|
|
|
|
|
|
|
def __init__(
|
|
|
|
self,
|
|
|
|
hs: "HomeServer",
|
|
|
|
media_repo: "MediaRepository",
|
|
|
|
media_storage: MediaStorage,
|
|
|
|
):
|
|
|
|
self.clock = hs.get_clock()
|
|
|
|
self.filepaths = media_repo.filepaths
|
|
|
|
self.max_spider_size = hs.config.media.max_spider_size
|
|
|
|
self.server_name = hs.hostname
|
|
|
|
self.store = hs.get_datastores().main
|
|
|
|
self.client = SimpleHttpClient(
|
|
|
|
hs,
|
|
|
|
treq_args={"browser_like_redirects": True},
|
2023-05-19 08:25:25 -04:00
|
|
|
ip_allowlist=hs.config.media.url_preview_ip_range_allowlist,
|
|
|
|
ip_blocklist=hs.config.media.url_preview_ip_range_blocklist,
|
2023-03-20 14:32:26 -04:00
|
|
|
use_proxy=True,
|
|
|
|
)
|
|
|
|
self.media_repo = media_repo
|
|
|
|
self.primary_base_path = media_repo.primary_base_path
|
|
|
|
self.media_storage = media_storage
|
|
|
|
|
|
|
|
self._oembed = OEmbedProvider(hs)
|
|
|
|
|
|
|
|
# We run the background jobs if we're the instance specified (or no
|
|
|
|
# instance is specified, where we assume there is only one instance
|
|
|
|
# serving media).
|
|
|
|
instance_running_jobs = hs.config.media.media_instance_running_background_jobs
|
|
|
|
self._worker_run_media_background_jobs = (
|
|
|
|
instance_running_jobs is None
|
|
|
|
or instance_running_jobs == hs.get_instance_name()
|
|
|
|
)
|
|
|
|
|
2023-05-19 08:25:25 -04:00
|
|
|
self.url_preview_url_blocklist = hs.config.media.url_preview_url_blocklist
|
2023-03-20 14:32:26 -04:00
|
|
|
self.url_preview_accept_language = hs.config.media.url_preview_accept_language
|
|
|
|
|
|
|
|
# memory cache mapping urls to an ObservableDeferred returning
|
|
|
|
# JSON-encoded OG metadata
|
|
|
|
self._cache: ExpiringCache[str, ObservableDeferred] = ExpiringCache(
|
|
|
|
cache_name="url_previews",
|
|
|
|
clock=self.clock,
|
|
|
|
# don't spider URLs more often than once an hour
|
|
|
|
expiry_ms=ONE_HOUR,
|
|
|
|
)
|
|
|
|
|
|
|
|
if self._worker_run_media_background_jobs:
|
|
|
|
self._cleaner_loop = self.clock.looping_call(
|
|
|
|
self._start_expire_url_cache_data, 10 * 1000
|
|
|
|
)
|
|
|
|
|
|
|
|
async def preview(self, url: str, user: UserID, ts: int) -> bytes:
|
|
|
|
# the in-memory cache:
|
2023-05-16 16:25:01 -04:00
|
|
|
# * ensures that only one request to a URL is active at a time
|
2023-03-20 14:32:26 -04:00
|
|
|
# * takes load off the DB for the thundering herds
|
|
|
|
# * also caches any failures (unlike the DB) so we don't keep
|
2023-05-16 16:25:01 -04:00
|
|
|
# requesting the same endpoint
|
|
|
|
#
|
|
|
|
# Note that autodiscovered oEmbed URLs and pre-caching of images
|
|
|
|
# are not captured in the in-memory cache.
|
2023-03-20 14:32:26 -04:00
|
|
|
|
|
|
|
observable = self._cache.get(url)
|
|
|
|
|
|
|
|
if not observable:
|
|
|
|
download = run_in_background(self._do_preview, url, user, ts)
|
|
|
|
observable = ObservableDeferred(download, consumeErrors=True)
|
|
|
|
self._cache[url] = observable
|
|
|
|
else:
|
|
|
|
logger.info("Returning cached response")
|
|
|
|
|
|
|
|
return await make_deferred_yieldable(observable.observe())
|
|
|
|
|
|
|
|
async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes:
|
|
|
|
"""Check the db, and download the URL and build a preview
|
|
|
|
|
|
|
|
Args:
|
|
|
|
url: The URL to preview.
|
|
|
|
user: The user requesting the preview.
|
|
|
|
ts: The timestamp requested for the preview.
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
json-encoded og data
|
|
|
|
"""
|
|
|
|
# check the URL cache in the DB (which will also provide us with
|
|
|
|
# historical previews, if we have any)
|
|
|
|
cache_result = await self.store.get_url_cache(url, ts)
|
|
|
|
if (
|
|
|
|
cache_result
|
2023-11-09 11:00:30 -05:00
|
|
|
and cache_result.expires_ts > ts
|
|
|
|
and cache_result.response_code // 100 == 2
|
2023-03-20 14:32:26 -04:00
|
|
|
):
|
|
|
|
# It may be stored as text in the database, not as bytes (such as
|
|
|
|
# PostgreSQL). If so, encode it back before handing it on.
|
2023-11-09 11:00:30 -05:00
|
|
|
if isinstance(cache_result.og, str):
|
|
|
|
return cache_result.og.encode("utf8")
|
|
|
|
return cache_result.og
|
2023-03-20 14:32:26 -04:00
|
|
|
|
2023-05-16 16:25:01 -04:00
|
|
|
# If this URL can be accessed via an allowed oEmbed, use that instead.
|
2023-03-20 14:32:26 -04:00
|
|
|
url_to_download = url
|
|
|
|
oembed_url = self._oembed.get_oembed_url(url)
|
|
|
|
if oembed_url:
|
|
|
|
url_to_download = oembed_url
|
|
|
|
|
|
|
|
media_info = await self._handle_url(url_to_download, user)
|
|
|
|
|
|
|
|
logger.debug("got media_info of '%s'", media_info)
|
|
|
|
|
|
|
|
# The number of milliseconds that the response should be considered valid.
|
|
|
|
expiration_ms = media_info.expires
|
|
|
|
author_name: Optional[str] = None
|
|
|
|
|
|
|
|
if _is_media(media_info.media_type):
|
|
|
|
file_id = media_info.filesystem_id
|
|
|
|
dims = await self.media_repo._generate_thumbnails(
|
|
|
|
None, file_id, file_id, media_info.media_type, url_cache=True
|
|
|
|
)
|
|
|
|
|
|
|
|
og = {
|
|
|
|
"og:description": media_info.download_name,
|
|
|
|
"og:image": f"mxc://{self.server_name}/{media_info.filesystem_id}",
|
|
|
|
"og:image:type": media_info.media_type,
|
|
|
|
"matrix:image:size": media_info.media_length,
|
|
|
|
}
|
|
|
|
|
|
|
|
if dims:
|
|
|
|
og["og:image:width"] = dims["width"]
|
|
|
|
og["og:image:height"] = dims["height"]
|
|
|
|
else:
|
|
|
|
logger.warning("Couldn't get dims for %s" % url)
|
|
|
|
|
|
|
|
# define our OG response for this media
|
|
|
|
elif _is_html(media_info.media_type):
|
|
|
|
# TODO: somehow stop a big HTML tree from exploding synapse's RAM
|
|
|
|
|
|
|
|
with open(media_info.filename, "rb") as file:
|
|
|
|
body = file.read()
|
|
|
|
|
|
|
|
tree = decode_body(body, media_info.uri, media_info.media_type)
|
|
|
|
if tree is not None:
|
|
|
|
# Check if this HTML document points to oEmbed information and
|
|
|
|
# defer to that.
|
|
|
|
oembed_url = self._oembed.autodiscover_from_html(tree)
|
|
|
|
og_from_oembed: JsonDict = {}
|
2023-05-16 16:25:01 -04:00
|
|
|
# Only download to the oEmbed URL if it is allowed.
|
2023-03-20 14:32:26 -04:00
|
|
|
if oembed_url:
|
|
|
|
try:
|
|
|
|
oembed_info = await self._handle_url(
|
|
|
|
oembed_url, user, allow_data_urls=True
|
|
|
|
)
|
|
|
|
except Exception as e:
|
|
|
|
# Fetching the oEmbed info failed, don't block the entire URL preview.
|
|
|
|
logger.warning(
|
|
|
|
"oEmbed fetch failed during URL preview: %s errored with %s",
|
|
|
|
oembed_url,
|
|
|
|
e,
|
|
|
|
)
|
|
|
|
else:
|
|
|
|
(
|
|
|
|
og_from_oembed,
|
|
|
|
author_name,
|
|
|
|
expiration_ms,
|
|
|
|
) = await self._handle_oembed_response(
|
|
|
|
url, oembed_info, expiration_ms
|
|
|
|
)
|
|
|
|
|
|
|
|
# Parse Open Graph information from the HTML in case the oEmbed
|
|
|
|
# response failed or is incomplete.
|
|
|
|
og_from_html = parse_html_to_open_graph(tree)
|
|
|
|
|
|
|
|
# Compile the Open Graph response by using the scraped
|
|
|
|
# information from the HTML and overlaying any information
|
|
|
|
# from the oEmbed response.
|
|
|
|
og = {**og_from_html, **og_from_oembed}
|
|
|
|
|
|
|
|
await self._precache_image_url(user, media_info, og)
|
|
|
|
else:
|
|
|
|
og = {}
|
|
|
|
|
|
|
|
elif oembed_url:
|
|
|
|
# Handle the oEmbed information.
|
|
|
|
og, author_name, expiration_ms = await self._handle_oembed_response(
|
|
|
|
url, media_info, expiration_ms
|
|
|
|
)
|
|
|
|
await self._precache_image_url(user, media_info, og)
|
|
|
|
|
|
|
|
else:
|
|
|
|
logger.warning("Failed to find any OG data in %s", url)
|
|
|
|
og = {}
|
|
|
|
|
|
|
|
# If we don't have a title but we have author_name, copy it as
|
|
|
|
# title
|
|
|
|
if not og.get("og:title") and author_name:
|
|
|
|
og["og:title"] = author_name
|
|
|
|
|
|
|
|
# filter out any stupidly long values
|
|
|
|
keys_to_remove = []
|
|
|
|
for k, v in og.items():
|
|
|
|
# values can be numeric as well as strings, hence the cast to str
|
|
|
|
if len(k) > OG_TAG_NAME_MAXLEN or len(str(v)) > OG_TAG_VALUE_MAXLEN:
|
|
|
|
logger.warning(
|
|
|
|
"Pruning overlong tag %s from OG data", k[:OG_TAG_NAME_MAXLEN]
|
|
|
|
)
|
|
|
|
keys_to_remove.append(k)
|
|
|
|
for k in keys_to_remove:
|
|
|
|
del og[k]
|
|
|
|
|
|
|
|
logger.debug("Calculated OG for %s as %s", url, og)
|
|
|
|
|
|
|
|
jsonog = json_encoder.encode(og)
|
|
|
|
|
|
|
|
# Cap the amount of time to consider a response valid.
|
|
|
|
expiration_ms = min(expiration_ms, ONE_DAY)
|
|
|
|
|
|
|
|
# store OG in history-aware DB cache
|
|
|
|
await self.store.store_url_cache(
|
|
|
|
url,
|
|
|
|
media_info.response_code,
|
|
|
|
media_info.etag,
|
|
|
|
media_info.created_ts_ms + expiration_ms,
|
|
|
|
jsonog,
|
|
|
|
media_info.filesystem_id,
|
|
|
|
media_info.created_ts_ms,
|
|
|
|
)
|
|
|
|
|
|
|
|
return jsonog.encode("utf8")
|
|
|
|
|
2023-05-16 16:25:01 -04:00
|
|
|
def _is_url_blocked(self, url: str) -> bool:
|
|
|
|
"""
|
|
|
|
Check whether the URL is allowed to be previewed (according to the homeserver
|
|
|
|
configuration).
|
|
|
|
|
|
|
|
Args:
|
|
|
|
url: The requested URL.
|
|
|
|
|
|
|
|
Return:
|
|
|
|
True if the URL is blocked, False if it is allowed.
|
|
|
|
"""
|
|
|
|
url_tuple = urlsplit(url)
|
2023-05-19 08:25:25 -04:00
|
|
|
for entry in self.url_preview_url_blocklist:
|
2023-05-16 16:25:01 -04:00
|
|
|
match = True
|
|
|
|
# Iterate over each entry. If *all* attributes of that entry match
|
|
|
|
# the current URL, then reject it.
|
|
|
|
for attrib, pattern in entry.items():
|
|
|
|
value = getattr(url_tuple, attrib)
|
|
|
|
logger.debug(
|
|
|
|
"Matching attrib '%s' with value '%s' against pattern '%s'",
|
|
|
|
attrib,
|
|
|
|
value,
|
|
|
|
pattern,
|
|
|
|
)
|
|
|
|
|
|
|
|
if value is None:
|
|
|
|
match = False
|
|
|
|
break
|
|
|
|
|
|
|
|
# Some attributes might not be parsed as strings by urlsplit (such as the
|
|
|
|
# port, which is parsed as an int). Because we use match functions that
|
|
|
|
# expect strings, we want to make sure that's what we give them.
|
|
|
|
value_str = str(value)
|
|
|
|
|
|
|
|
# Check the value against the pattern as either a regular expression or
|
|
|
|
# a glob. If it doesn't match, the entry doesn't match.
|
|
|
|
if pattern.startswith("^"):
|
|
|
|
if not re.match(pattern, value_str):
|
|
|
|
match = False
|
|
|
|
break
|
|
|
|
else:
|
|
|
|
if not fnmatch.fnmatch(value_str, pattern):
|
|
|
|
match = False
|
|
|
|
break
|
|
|
|
|
|
|
|
# All fields matched, return true (the URL is blocked).
|
|
|
|
if match:
|
2023-05-19 08:25:25 -04:00
|
|
|
logger.warning("URL %s blocked by entry %s", url, entry)
|
2023-05-16 16:25:01 -04:00
|
|
|
return match
|
|
|
|
|
|
|
|
# No matches were found, the URL is allowed.
|
|
|
|
return False
|
|
|
|
|
2023-03-20 14:32:26 -04:00
|
|
|
async def _download_url(self, url: str, output_stream: BinaryIO) -> DownloadResult:
|
|
|
|
"""
|
|
|
|
Fetches a remote URL and parses the headers.
|
|
|
|
|
|
|
|
Args:
|
|
|
|
url: The URL to fetch.
|
|
|
|
output_stream: The stream to write the content to.
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
A tuple of:
|
|
|
|
Media length, URL downloaded, the HTTP response code,
|
|
|
|
the media type, the downloaded file name, the number of
|
|
|
|
milliseconds the result is valid for, the etag header.
|
|
|
|
"""
|
|
|
|
|
|
|
|
try:
|
|
|
|
logger.debug("Trying to get preview for url '%s'", url)
|
|
|
|
length, headers, uri, code = await self.client.get_file(
|
|
|
|
url,
|
|
|
|
output_stream=output_stream,
|
|
|
|
max_size=self.max_spider_size,
|
|
|
|
headers={
|
|
|
|
b"Accept-Language": self.url_preview_accept_language,
|
|
|
|
# Use a custom user agent for the preview because some sites will only return
|
|
|
|
# Open Graph metadata to crawler user agents. Omit the Synapse version
|
|
|
|
# string to avoid leaking information.
|
|
|
|
b"User-Agent": [
|
|
|
|
"Synapse (bot; +https://github.com/matrix-org/synapse)"
|
|
|
|
],
|
|
|
|
},
|
|
|
|
is_allowed_content_type=_is_previewable,
|
|
|
|
)
|
|
|
|
except SynapseError:
|
|
|
|
# Pass SynapseErrors through directly, so that the servlet
|
|
|
|
# handler will return a SynapseError to the client instead of
|
|
|
|
# blank data or a 500.
|
|
|
|
raise
|
|
|
|
except DNSLookupError:
|
|
|
|
# DNS lookup returned no results
|
|
|
|
# Note: This will also be the case if one of the resolved IP
|
2023-05-19 08:25:25 -04:00
|
|
|
# addresses is blocked.
|
2023-03-20 14:32:26 -04:00
|
|
|
raise SynapseError(
|
|
|
|
502,
|
|
|
|
"DNS resolution failure during URL preview generation",
|
|
|
|
Codes.UNKNOWN,
|
|
|
|
)
|
|
|
|
except Exception as e:
|
|
|
|
# FIXME: pass through 404s and other error messages nicely
|
|
|
|
logger.warning("Error downloading %s: %r", url, e)
|
|
|
|
|
|
|
|
raise SynapseError(
|
|
|
|
500,
|
|
|
|
"Failed to download content: %s"
|
|
|
|
% (traceback.format_exception_only(sys.exc_info()[0], e),),
|
|
|
|
Codes.UNKNOWN,
|
|
|
|
)
|
|
|
|
|
|
|
|
if b"Content-Type" in headers:
|
|
|
|
media_type = headers[b"Content-Type"][0].decode("ascii")
|
|
|
|
else:
|
|
|
|
media_type = "application/octet-stream"
|
|
|
|
|
|
|
|
download_name = get_filename_from_headers(headers)
|
|
|
|
|
|
|
|
# FIXME: we should calculate a proper expiration based on the
|
|
|
|
# Cache-Control and Expire headers. But for now, assume 1 hour.
|
|
|
|
expires = ONE_HOUR
|
|
|
|
etag = headers[b"ETag"][0].decode("ascii") if b"ETag" in headers else None
|
|
|
|
|
|
|
|
return DownloadResult(
|
|
|
|
length, uri, code, media_type, download_name, expires, etag
|
|
|
|
)
|
|
|
|
|
|
|
|
async def _parse_data_url(
|
|
|
|
self, url: str, output_stream: BinaryIO
|
|
|
|
) -> DownloadResult:
|
|
|
|
"""
|
|
|
|
Parses a data: URL.
|
|
|
|
|
|
|
|
Args:
|
|
|
|
url: The URL to parse.
|
|
|
|
output_stream: The stream to write the content to.
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
A tuple of:
|
|
|
|
Media length, URL downloaded, the HTTP response code,
|
|
|
|
the media type, the downloaded file name, the number of
|
|
|
|
milliseconds the result is valid for, the etag header.
|
|
|
|
"""
|
|
|
|
|
|
|
|
try:
|
|
|
|
logger.debug("Trying to parse data url '%s'", url)
|
|
|
|
with urlopen(url) as url_info:
|
|
|
|
# TODO Can this be more efficient.
|
|
|
|
output_stream.write(url_info.read())
|
|
|
|
except Exception as e:
|
|
|
|
logger.warning("Error parsing data: URL %s: %r", url, e)
|
|
|
|
|
|
|
|
raise SynapseError(
|
|
|
|
500,
|
|
|
|
"Failed to parse data URL: %s"
|
|
|
|
% (traceback.format_exception_only(sys.exc_info()[0], e),),
|
|
|
|
Codes.UNKNOWN,
|
|
|
|
)
|
|
|
|
|
|
|
|
return DownloadResult(
|
|
|
|
# Read back the length that has been written.
|
|
|
|
length=output_stream.tell(),
|
|
|
|
uri=url,
|
|
|
|
# If it was parsed, consider this a 200 OK.
|
|
|
|
response_code=200,
|
|
|
|
# urlopen shoves the media-type from the data URL into the content type
|
|
|
|
# header object.
|
|
|
|
media_type=url_info.headers.get_content_type(),
|
|
|
|
# Some features are not supported by data: URLs.
|
|
|
|
download_name=None,
|
|
|
|
expires=ONE_HOUR,
|
|
|
|
etag=None,
|
|
|
|
)
|
|
|
|
|
|
|
|
async def _handle_url(
|
|
|
|
self, url: str, user: UserID, allow_data_urls: bool = False
|
|
|
|
) -> MediaInfo:
|
|
|
|
"""
|
|
|
|
Fetches content from a URL and parses the result to generate a MediaInfo.
|
|
|
|
|
|
|
|
It uses the media storage provider to persist the fetched content and
|
|
|
|
stores the mapping into the database.
|
|
|
|
|
|
|
|
Args:
|
|
|
|
url: The URL to fetch.
|
|
|
|
user: The user who ahs requested this URL.
|
|
|
|
allow_data_urls: True if data URLs should be allowed.
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
A MediaInfo object describing the fetched content.
|
2023-05-16 16:25:01 -04:00
|
|
|
|
|
|
|
Raises:
|
|
|
|
SynapseError if the URL is blocked.
|
2023-03-20 14:32:26 -04:00
|
|
|
"""
|
|
|
|
|
2023-05-16 16:25:01 -04:00
|
|
|
if self._is_url_blocked(url):
|
|
|
|
raise SynapseError(
|
2023-05-19 08:25:25 -04:00
|
|
|
403, "URL blocked by url pattern blocklist entry", Codes.UNKNOWN
|
2023-05-16 16:25:01 -04:00
|
|
|
)
|
|
|
|
|
2023-03-20 14:32:26 -04:00
|
|
|
# TODO: we should probably honour robots.txt... except in practice
|
|
|
|
# we're most likely being explicitly triggered by a human rather than a
|
|
|
|
# bot, so are we really a robot?
|
|
|
|
|
|
|
|
file_id = datetime.date.today().isoformat() + "_" + random_string(16)
|
|
|
|
|
|
|
|
file_info = FileInfo(server_name=None, file_id=file_id, url_cache=True)
|
|
|
|
|
|
|
|
with self.media_storage.store_into_file(file_info) as (f, fname, finish):
|
|
|
|
if url.startswith("data:"):
|
|
|
|
if not allow_data_urls:
|
|
|
|
raise SynapseError(
|
|
|
|
500, "Previewing of data: URLs is forbidden", Codes.UNKNOWN
|
|
|
|
)
|
|
|
|
|
|
|
|
download_result = await self._parse_data_url(url, f)
|
|
|
|
else:
|
|
|
|
download_result = await self._download_url(url, f)
|
|
|
|
|
|
|
|
await finish()
|
|
|
|
|
|
|
|
try:
|
|
|
|
time_now_ms = self.clock.time_msec()
|
|
|
|
|
|
|
|
await self.store.store_local_media(
|
|
|
|
media_id=file_id,
|
|
|
|
media_type=download_result.media_type,
|
|
|
|
time_now_ms=time_now_ms,
|
|
|
|
upload_name=download_result.download_name,
|
|
|
|
media_length=download_result.length,
|
|
|
|
user_id=user,
|
|
|
|
url_cache=url,
|
|
|
|
)
|
|
|
|
|
|
|
|
except Exception as e:
|
|
|
|
logger.error("Error handling downloaded %s: %r", url, e)
|
|
|
|
# TODO: we really ought to delete the downloaded file in this
|
|
|
|
# case, since we won't have recorded it in the db, and will
|
|
|
|
# therefore not expire it.
|
|
|
|
raise
|
|
|
|
|
|
|
|
return MediaInfo(
|
|
|
|
media_type=download_result.media_type,
|
|
|
|
media_length=download_result.length,
|
|
|
|
download_name=download_result.download_name,
|
|
|
|
created_ts_ms=time_now_ms,
|
|
|
|
filesystem_id=file_id,
|
|
|
|
filename=fname,
|
|
|
|
uri=download_result.uri,
|
|
|
|
response_code=download_result.response_code,
|
|
|
|
expires=download_result.expires,
|
|
|
|
etag=download_result.etag,
|
|
|
|
)
|
|
|
|
|
|
|
|
async def _precache_image_url(
|
|
|
|
self, user: UserID, media_info: MediaInfo, og: JsonDict
|
|
|
|
) -> None:
|
|
|
|
"""
|
|
|
|
Pre-cache the image (if one exists) for posterity
|
|
|
|
|
|
|
|
Args:
|
|
|
|
user: The user requesting the preview.
|
|
|
|
media_info: The media being previewed.
|
|
|
|
og: The Open Graph dictionary. This is modified with image information.
|
|
|
|
"""
|
|
|
|
# If there's no image or it is blank, there's nothing to do.
|
|
|
|
if "og:image" not in og:
|
|
|
|
return
|
|
|
|
|
|
|
|
# Remove the raw image URL, this will be replaced with an MXC URL, if successful.
|
|
|
|
image_url = og.pop("og:image")
|
|
|
|
if not image_url:
|
|
|
|
return
|
|
|
|
|
|
|
|
# The image URL from the HTML might be relative to the previewed page,
|
2023-05-16 16:25:01 -04:00
|
|
|
# convert it to a URL which can be requested directly.
|
2023-03-20 14:32:26 -04:00
|
|
|
url_parts = urlparse(image_url)
|
|
|
|
if url_parts.scheme != "data":
|
|
|
|
image_url = urljoin(media_info.uri, image_url)
|
|
|
|
|
|
|
|
# FIXME: it might be cleaner to use the same flow as the main /preview_url
|
|
|
|
# request itself and benefit from the same caching etc. But for now we
|
|
|
|
# just rely on the caching on the master request to speed things up.
|
|
|
|
try:
|
|
|
|
image_info = await self._handle_url(image_url, user, allow_data_urls=True)
|
|
|
|
except Exception as e:
|
|
|
|
# Pre-caching the image failed, don't block the entire URL preview.
|
|
|
|
logger.warning(
|
|
|
|
"Pre-caching image failed during URL preview: %s errored with %s",
|
|
|
|
image_url,
|
|
|
|
e,
|
|
|
|
)
|
|
|
|
return
|
|
|
|
|
|
|
|
if _is_media(image_info.media_type):
|
|
|
|
# TODO: make sure we don't choke on white-on-transparent images
|
|
|
|
file_id = image_info.filesystem_id
|
|
|
|
dims = await self.media_repo._generate_thumbnails(
|
|
|
|
None, file_id, file_id, image_info.media_type, url_cache=True
|
|
|
|
)
|
|
|
|
if dims:
|
|
|
|
og["og:image:width"] = dims["width"]
|
|
|
|
og["og:image:height"] = dims["height"]
|
|
|
|
else:
|
|
|
|
logger.warning("Couldn't get dims for %s", image_url)
|
|
|
|
|
|
|
|
og["og:image"] = f"mxc://{self.server_name}/{image_info.filesystem_id}"
|
|
|
|
og["og:image:type"] = image_info.media_type
|
|
|
|
og["matrix:image:size"] = image_info.media_length
|
|
|
|
|
|
|
|
async def _handle_oembed_response(
|
|
|
|
self, url: str, media_info: MediaInfo, expiration_ms: int
|
|
|
|
) -> Tuple[JsonDict, Optional[str], int]:
|
|
|
|
"""
|
|
|
|
Parse the downloaded oEmbed info.
|
|
|
|
|
|
|
|
Args:
|
|
|
|
url: The URL which is being previewed (not the one which was
|
|
|
|
requested).
|
|
|
|
media_info: The media being previewed.
|
|
|
|
expiration_ms: The length of time, in milliseconds, the media is valid for.
|
|
|
|
|
|
|
|
Returns:
|
|
|
|
A tuple of:
|
|
|
|
The Open Graph dictionary, if the oEmbed info can be parsed.
|
|
|
|
The author name if it could be retrieved from oEmbed.
|
|
|
|
The (possibly updated) length of time, in milliseconds, the media is valid for.
|
|
|
|
"""
|
|
|
|
# If JSON was not returned, there's nothing to do.
|
|
|
|
if not _is_json(media_info.media_type):
|
|
|
|
return {}, None, expiration_ms
|
|
|
|
|
|
|
|
with open(media_info.filename, "rb") as file:
|
|
|
|
body = file.read()
|
|
|
|
|
|
|
|
oembed_response = self._oembed.parse_oembed_response(url, body)
|
|
|
|
open_graph_result = oembed_response.open_graph_result
|
|
|
|
|
|
|
|
# Use the cache age from the oEmbed result, if one was given.
|
|
|
|
if open_graph_result and oembed_response.cache_age is not None:
|
|
|
|
expiration_ms = oembed_response.cache_age
|
|
|
|
|
|
|
|
return open_graph_result, oembed_response.author_name, expiration_ms
|
|
|
|
|
|
|
|
def _start_expire_url_cache_data(self) -> Deferred:
|
|
|
|
return run_as_background_process(
|
|
|
|
"expire_url_cache_data", self._expire_url_cache_data
|
|
|
|
)
|
|
|
|
|
|
|
|
async def _expire_url_cache_data(self) -> None:
|
|
|
|
"""Clean up expired url cache content, media and thumbnails."""
|
|
|
|
|
|
|
|
assert self._worker_run_media_background_jobs
|
|
|
|
|
|
|
|
now = self.clock.time_msec()
|
|
|
|
|
|
|
|
logger.debug("Running url preview cache expiry")
|
|
|
|
|
|
|
|
def try_remove_parent_dirs(dirs: Iterable[str]) -> None:
|
|
|
|
"""Attempt to remove the given chain of parent directories
|
|
|
|
|
|
|
|
Args:
|
|
|
|
dirs: The list of directory paths to delete, with children appearing
|
|
|
|
before their parents.
|
|
|
|
"""
|
|
|
|
for dir in dirs:
|
|
|
|
try:
|
|
|
|
os.rmdir(dir)
|
|
|
|
except FileNotFoundError:
|
|
|
|
# Already deleted, continue with deleting the rest
|
|
|
|
pass
|
|
|
|
except OSError as e:
|
|
|
|
# Failed, skip deleting the rest of the parent dirs
|
|
|
|
if e.errno != errno.ENOTEMPTY:
|
|
|
|
logger.warning(
|
|
|
|
"Failed to remove media directory while clearing url preview cache: %r: %s",
|
|
|
|
dir,
|
|
|
|
e,
|
|
|
|
)
|
|
|
|
break
|
|
|
|
|
|
|
|
# First we delete expired url cache entries
|
|
|
|
media_ids = await self.store.get_expired_url_cache(now)
|
|
|
|
|
|
|
|
removed_media = []
|
|
|
|
for media_id in media_ids:
|
|
|
|
fname = self.filepaths.url_cache_filepath(media_id)
|
|
|
|
try:
|
|
|
|
os.remove(fname)
|
|
|
|
except FileNotFoundError:
|
|
|
|
pass # If the path doesn't exist, meh
|
|
|
|
except OSError as e:
|
|
|
|
logger.warning(
|
|
|
|
"Failed to remove media while clearing url preview cache: %r: %s",
|
|
|
|
media_id,
|
|
|
|
e,
|
|
|
|
)
|
|
|
|
continue
|
|
|
|
|
|
|
|
removed_media.append(media_id)
|
|
|
|
|
|
|
|
dirs = self.filepaths.url_cache_filepath_dirs_to_delete(media_id)
|
|
|
|
try_remove_parent_dirs(dirs)
|
|
|
|
|
|
|
|
await self.store.delete_url_cache(removed_media)
|
|
|
|
|
|
|
|
if removed_media:
|
|
|
|
logger.debug(
|
|
|
|
"Deleted %d entries from url preview cache", len(removed_media)
|
|
|
|
)
|
|
|
|
else:
|
|
|
|
logger.debug("No entries removed from url preview cache")
|
|
|
|
|
|
|
|
# Now we delete old images associated with the url cache.
|
|
|
|
# These may be cached for a bit on the client (i.e., they
|
|
|
|
# may have a room open with a preview url thing open).
|
|
|
|
# So we wait a couple of days before deleting, just in case.
|
|
|
|
expire_before = now - IMAGE_CACHE_EXPIRY_MS
|
|
|
|
media_ids = await self.store.get_url_cache_media_before(expire_before)
|
|
|
|
|
|
|
|
removed_media = []
|
|
|
|
for media_id in media_ids:
|
|
|
|
fname = self.filepaths.url_cache_filepath(media_id)
|
|
|
|
try:
|
|
|
|
os.remove(fname)
|
|
|
|
except FileNotFoundError:
|
|
|
|
pass # If the path doesn't exist, meh
|
|
|
|
except OSError as e:
|
|
|
|
logger.warning(
|
|
|
|
"Failed to remove media from url preview cache: %r: %s", media_id, e
|
|
|
|
)
|
|
|
|
continue
|
|
|
|
|
|
|
|
dirs = self.filepaths.url_cache_filepath_dirs_to_delete(media_id)
|
|
|
|
try_remove_parent_dirs(dirs)
|
|
|
|
|
|
|
|
thumbnail_dir = self.filepaths.url_cache_thumbnail_directory(media_id)
|
|
|
|
try:
|
|
|
|
shutil.rmtree(thumbnail_dir)
|
|
|
|
except FileNotFoundError:
|
|
|
|
pass # If the path doesn't exist, meh
|
|
|
|
except OSError as e:
|
|
|
|
logger.warning(
|
|
|
|
"Failed to remove media from url preview cache: %r: %s", media_id, e
|
|
|
|
)
|
|
|
|
continue
|
|
|
|
|
|
|
|
removed_media.append(media_id)
|
|
|
|
|
|
|
|
dirs = self.filepaths.url_cache_thumbnail_dirs_to_delete(media_id)
|
|
|
|
# Note that one of the directories to be deleted has already been
|
|
|
|
# removed by the `rmtree` above.
|
|
|
|
try_remove_parent_dirs(dirs)
|
|
|
|
|
|
|
|
await self.store.delete_url_cache_media(removed_media)
|
|
|
|
|
|
|
|
if removed_media:
|
|
|
|
logger.debug("Deleted %d media from url preview cache", len(removed_media))
|
|
|
|
else:
|
|
|
|
logger.debug("No media removed from url preview cache")
|
|
|
|
|
|
|
|
|
|
|
|
def _is_media(content_type: str) -> bool:
|
|
|
|
return content_type.lower().startswith("image/")
|
|
|
|
|
|
|
|
|
|
|
|
def _is_html(content_type: str) -> bool:
|
|
|
|
content_type = content_type.lower()
|
2023-09-08 11:24:36 -04:00
|
|
|
return content_type.startswith(("text/html", "application/xhtml"))
|
2023-03-20 14:32:26 -04:00
|
|
|
|
|
|
|
|
|
|
|
def _is_json(content_type: str) -> bool:
|
|
|
|
return content_type.lower().startswith("application/json")
|
|
|
|
|
|
|
|
|
|
|
|
def _is_previewable(content_type: str) -> bool:
|
|
|
|
"""Returns True for content types for which we will perform URL preview and False
|
|
|
|
otherwise."""
|
|
|
|
|
|
|
|
return _is_html(content_type) or _is_media(content_type) or _is_json(content_type)
|