2016-01-24 18:47:27 -05:00
|
|
|
# Copyright 2016 OpenMarket Ltd
|
2021-01-15 10:57:37 -05:00
|
|
|
# Copyright 2020-2021 The Matrix.org Foundation C.I.C.
|
2016-01-24 18:47:27 -05:00
|
|
|
#
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
#
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
#
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
2018-02-01 19:35:18 -05:00
|
|
|
import datetime
|
|
|
|
import errno
|
|
|
|
import fnmatch
|
|
|
|
import logging
|
|
|
|
import os
|
|
|
|
import re
|
|
|
|
import shutil
|
|
|
|
import sys
|
|
|
|
import traceback
|
2022-01-24 08:58:18 -05:00
|
|
|
from typing import TYPE_CHECKING, BinaryIO, Iterable, Optional, Tuple
|
2020-06-16 08:51:47 -04:00
|
|
|
from urllib import parse as urlparse
|
2022-01-24 08:58:18 -05:00
|
|
|
from urllib.request import urlopen
|
2018-07-09 02:09:20 -04:00
|
|
|
|
2021-09-07 09:10:34 -04:00
|
|
|
import attr
|
|
|
|
|
2021-09-15 08:45:32 -04:00
|
|
|
from twisted.internet.defer import Deferred
|
2019-05-10 13:32:44 -04:00
|
|
|
from twisted.internet.error import DNSLookupError
|
2016-04-03 07:56:29 -04:00
|
|
|
|
2018-07-09 02:09:20 -04:00
|
|
|
from synapse.api.errors import Codes, SynapseError
|
2018-12-21 09:56:13 -05:00
|
|
|
from synapse.http.client import SimpleHttpClient
|
2016-04-03 07:56:29 -04:00
|
|
|
from synapse.http.server import (
|
2020-07-03 14:02:19 -04:00
|
|
|
DirectServeJsonResource,
|
2017-11-23 12:52:31 -05:00
|
|
|
respond_with_json,
|
2018-07-09 02:09:20 -04:00
|
|
|
respond_with_json_bytes,
|
2016-04-03 07:56:29 -04:00
|
|
|
)
|
2018-07-13 15:40:14 -04:00
|
|
|
from synapse.http.servlet import parse_integer, parse_string
|
2021-03-12 11:37:57 -05:00
|
|
|
from synapse.http.site import SynapseRequest
|
2019-07-03 10:07:04 -04:00
|
|
|
from synapse.logging.context import make_deferred_yieldable, run_in_background
|
2018-07-25 04:41:12 -04:00
|
|
|
from synapse.metrics.background_process_metrics import run_as_background_process
|
2018-11-15 16:55:58 -05:00
|
|
|
from synapse.rest.media.v1._base import get_filename_from_headers
|
2021-01-15 10:57:37 -05:00
|
|
|
from synapse.rest.media.v1.media_storage import MediaStorage
|
2021-09-21 12:09:57 -04:00
|
|
|
from synapse.rest.media.v1.oembed import OEmbedProvider
|
2021-12-13 12:55:07 -05:00
|
|
|
from synapse.rest.media.v1.preview_html import (
|
|
|
|
decode_body,
|
|
|
|
parse_html_to_open_graph,
|
|
|
|
rebase_url,
|
|
|
|
)
|
2021-11-12 11:05:26 -05:00
|
|
|
from synapse.types import JsonDict, UserID
|
2020-08-07 08:02:55 -04:00
|
|
|
from synapse.util import json_encoder
|
2018-08-10 09:50:21 -04:00
|
|
|
from synapse.util.async_helpers import ObservableDeferred
|
2018-07-09 02:09:20 -04:00
|
|
|
from synapse.util.caches.expiringcache import ExpiringCache
|
2018-11-15 16:55:58 -05:00
|
|
|
from synapse.util.stringutils import random_string
|
2018-07-09 02:09:20 -04:00
|
|
|
|
|
|
|
from ._base import FileInfo
|
2016-03-28 22:13:25 -04:00
|
|
|
|
2021-01-15 10:57:37 -05:00
|
|
|
if TYPE_CHECKING:
|
|
|
|
from synapse.rest.media.v1.media_repository import MediaRepository
|
2021-03-23 07:12:48 -04:00
|
|
|
from synapse.server import HomeServer
|
2021-01-15 10:57:37 -05:00
|
|
|
|
2016-01-24 18:47:27 -05:00
|
|
|
logger = logging.getLogger(__name__)

# Open Graph entries whose key or stringified value is longer than these
# limits are pruned from the preview response.
OG_TAG_NAME_MAXLEN = 50
OG_TAG_VALUE_MAXLEN = 1000

# Durations, expressed in milliseconds.
ONE_HOUR = 1000 * 60 * 60
ONE_DAY = ONE_HOUR * 24
# NOTE(review): not used in the visible part of this file; presumably the
# lifetime of cached preview images -- confirm against the expiry job.
IMAGE_CACHE_EXPIRY_MS = ONE_DAY * 2
|
2020-07-27 07:50:44 -04:00
|
|
|
|
2016-04-03 07:56:29 -04:00
|
|
|
|
2022-01-24 08:58:18 -05:00
|
|
|
@attr.s(auto_attribs=True, frozen=True, slots=True)
class DownloadResult:
    """The outcome of fetching a URL (or parsing a data: URL) for previewing."""

    # Number of bytes written to the output stream.
    length: int
    # The URI that was fetched.
    uri: str
    # The HTTP response code (200 is reported for a parsed data: URL).
    response_code: int
    # The media type of the content.
    media_type: str
    # The file name derived from the response headers, if any.
    download_name: Optional[str]
    # How long the result is considered valid for, in milliseconds.
    expires: int
    # The ETag header of the response, if one was sent.
    etag: Optional[str]
|
|
|
|
|
|
|
|
|
2021-09-07 09:10:34 -04:00
|
|
|
@attr.s(auto_attribs=True, frozen=True, slots=True)
class MediaInfo:
    """
    Information parsed from downloading media being previewed.
    """

    # Content-Type of the response.
    media_type: str
    # Size of the downloaded media, in bytes.
    media_length: int
    # File name of the media as reported by the server (parsed from the
    # returned headers, when possible).
    download_name: Optional[str]
    # When the preview was made.
    created_ts_ms: int
    # Where the media storage provider put the file on disk.
    filesystem_id: str
    filename: str
    # The URI that was previewed.
    uri: str
    # HTTP response code received.
    response_code: int
    # The timestamp (in milliseconds) of when this preview expires.
    expires: int
    # ETag header of the response, if any.
    etag: Optional[str]
|
|
|
|
|
|
|
|
|
2020-07-03 14:02:19 -04:00
|
|
|
class PreviewUrlResource(DirectServeJsonResource):
|
2021-09-07 09:10:34 -04:00
|
|
|
"""
|
|
|
|
Generating URL previews is a complicated task which has many potential pitfalls.
|
|
|
|
|
|
|
|
See docs/development/url_previews.md for discussion of the design and
|
|
|
|
algorithm followed in this module.
|
|
|
|
"""
|
|
|
|
|
2016-01-24 18:47:27 -05:00
|
|
|
isLeaf = True
|
|
|
|
|
2021-01-15 10:57:37 -05:00
|
|
|
def __init__(
    self,
    hs: "HomeServer",
    media_repo: "MediaRepository",
    media_storage: MediaStorage,
):
    """Wire up the URL preview resource.

    Args:
        hs: The homeserver; supplies auth, clock, datastore, hostname and
            the media configuration.
        media_repo: The media repository; supplies file paths and is used
            to generate thumbnails.
        media_storage: The storage backend downloaded content is written to.
    """
    super().__init__()

    media_config = hs.config.media

    self.auth = hs.get_auth()
    self.clock = hs.get_clock()
    self.filepaths = media_repo.filepaths
    self.max_spider_size = media_config.max_spider_size
    self.server_name = hs.hostname
    self.store = hs.get_datastore()

    # HTTP client used for spidering; it honours the configured IP range
    # whitelist/blacklist and goes through the proxy.
    self.client = SimpleHttpClient(
        hs,
        treq_args={"browser_like_redirects": True},
        ip_whitelist=media_config.url_preview_ip_range_whitelist,
        ip_blacklist=media_config.url_preview_ip_range_blacklist,
        use_proxy=True,
    )
    self.media_repo = media_repo
    self.primary_base_path = media_repo.primary_base_path
    self.media_storage = media_storage

    self._oembed = OEmbedProvider(hs)

    # Background jobs run on the instance named in the config; when no
    # instance is named we assume there is a single instance serving media
    # and run them here.
    designated_instance = media_config.media_instance_running_background_jobs
    self._worker_run_media_background_jobs = (
        designated_instance is None
        or designated_instance == hs.get_instance_name()
    )

    self.url_preview_url_blacklist = media_config.url_preview_url_blacklist
    self.url_preview_accept_language = media_config.url_preview_accept_language

    # In-memory cache: URL -> ObservableDeferred resolving to JSON-encoded
    # OG metadata. Entries live for an hour, so a URL is not spidered more
    # often than that.
    self._cache: ExpiringCache[str, ObservableDeferred] = ExpiringCache(
        cache_name="url_previews",
        clock=self.clock,
        expiry_ms=ONE_HOUR,
    )

    if self._worker_run_media_background_jobs:
        # Run the expiry job every 10 seconds.
        self._cleaner_loop = self.clock.looping_call(
            self._start_expire_url_cache_data, 10 * 1000
        )
|
2017-09-28 07:18:06 -04:00
|
|
|
|
2021-09-24 06:01:25 -04:00
|
|
|
async def _async_render_OPTIONS(self, request: SynapseRequest) -> None:
    """Answer an OPTIONS request.

    Sets the ``Allow`` header and replies 200 with an empty JSON object,
    including CORS headers.
    """
    allowed_methods = b"OPTIONS, GET"
    request.setHeader(b"Allow", allowed_methods)
    respond_with_json(request, 200, {}, send_cors=True)
|
2017-11-23 12:52:31 -05:00
|
|
|
|
2021-03-12 11:37:57 -05:00
|
|
|
async def _async_render_GET(self, request: SynapseRequest) -> None:
    """Serve a URL preview.

    Authenticates the requester, reads the ``url`` (required) and ``ts``
    (optional, defaults to now) query parameters, rejects URLs matching the
    configured URL blacklist, and responds with the JSON-encoded Open Graph
    data for the URL.

    Raises:
        SynapseError: 403 if the URL matches a blacklist entry.
    """
    # XXX: if get_user_by_req fails, what should we do in an async render?
    requester = await self.auth.get_user_by_req(request)
    url = parse_string(request, "url", required=True)
    ts = parse_integer(request, "ts")
    if ts is None:
        ts = self.clock.time_msec()

    # XXX: we could move this into _do_preview if we wanted.
    url_tuple = urlparse.urlsplit(url)
    for entry in self.url_preview_url_blacklist:
        # An entry blocks the URL only if *every* attribute it names matches.
        match = True
        for attrib, pattern in entry.items():
            value = getattr(url_tuple, attrib)
            logger.debug(
                "Matching attrib '%s' with value '%s' against pattern '%s'",
                attrib,
                value,
                pattern,
            )

            if value is None:
                match = False
                break

            # urlsplit does not return every attribute as a string (the port
            # is an int), but both re.match and fnmatch require one.
            value_str = str(value)

            # Patterns starting with "^" are regular expressions; anything
            # else is treated as a glob.
            if pattern.startswith("^"):
                matched = re.match(pattern, value_str) is not None
            else:
                matched = fnmatch.fnmatch(value_str, pattern)

            if not matched:
                # One mismatch rules the entry out; no need to look further.
                match = False
                break

        if match:
            logger.warning("URL %s blocked by url_blacklist entry %s", url, entry)
            raise SynapseError(
                403, "URL blocked by url pattern blacklist entry", Codes.UNKNOWN
            )

    # the in-memory cache:
    # * ensures that only one request is active at a time
    # * takes load off the DB for the thundering herds
    # * also caches any failures (unlike the DB) so we don't keep
    #   requesting the same endpoint

    observable = self._cache.get(url)

    if not observable:
        download = run_in_background(self._do_preview, url, requester.user, ts)
        observable = ObservableDeferred(download, consumeErrors=True)
        self._cache[url] = observable
    else:
        logger.info("Returning cached response")

    og = await make_deferred_yieldable(observable.observe())
    respond_with_json_bytes(request, 200, og, send_cors=True)
|
|
|
|
|
2021-11-12 11:05:26 -05:00
|
|
|
async def _do_preview(self, url: str, user: UserID, ts: int) -> bytes:
    """Check the db, and download the URL and build a preview

    A still-valid, successful entry in the DB cache is returned as-is.
    Otherwise the URL (or its oEmbed endpoint, if one is known) is fetched,
    Open Graph data is derived from the media / HTML / oEmbed response,
    overlong tags are pruned, and the result is stored in the DB cache.

    Args:
        url: The URL to preview.
        user: The user requesting the preview.
        ts: The timestamp requested for the preview.

    Returns:
        json-encoded og data
    """
    # check the URL cache in the DB (which will also provide us with
    # historical previews, if we have any)
    cache_result = await self.store.get_url_cache(url, ts)
    if (
        cache_result
        and cache_result["expires_ts"] > ts
        # Floor division, so that any 2xx code counts as a success (true
        # division would only ever match a code of exactly 200).
        and cache_result["response_code"] // 100 == 2
    ):
        # It may be stored as text in the database, not as bytes (such as
        # PostgreSQL). If so, encode it back before handing it on.
        og = cache_result["og"]
        if isinstance(og, str):
            og = og.encode("utf8")
        return og

    # If this URL can be accessed via oEmbed, use that instead.
    url_to_download = url
    oembed_url = self._oembed.get_oembed_url(url)
    if oembed_url:
        url_to_download = oembed_url

    media_info = await self._handle_url(url_to_download, user)

    logger.debug("got media_info of '%s'", media_info)

    # The number of milliseconds that the response should be considered valid.
    expiration_ms = media_info.expires
    author_name: Optional[str] = None

    if _is_media(media_info.media_type):
        file_id = media_info.filesystem_id
        dims = await self.media_repo._generate_thumbnails(
            None, file_id, file_id, media_info.media_type, url_cache=True
        )

        # define our OG response for this media
        og = {
            "og:description": media_info.download_name,
            "og:image": f"mxc://{self.server_name}/{media_info.filesystem_id}",
            "og:image:type": media_info.media_type,
            "matrix:image:size": media_info.media_length,
        }

        if dims:
            og["og:image:width"] = dims["width"]
            og["og:image:height"] = dims["height"]
        else:
            logger.warning("Couldn't get dims for %s", url)

    elif _is_html(media_info.media_type):
        # TODO: somehow stop a big HTML tree from exploding synapse's RAM

        with open(media_info.filename, "rb") as file:
            body = file.read()

        tree = decode_body(body, media_info.uri, media_info.media_type)
        if tree is not None:
            # Check if this HTML document points to oEmbed information and
            # defer to that.
            oembed_url = self._oembed.autodiscover_from_html(tree)
            og_from_oembed: JsonDict = {}
            if oembed_url:
                oembed_info = await self._handle_url(
                    oembed_url, user, allow_data_urls=True
                )
                (
                    og_from_oembed,
                    author_name,
                    expiration_ms,
                ) = await self._handle_oembed_response(
                    url, oembed_info, expiration_ms
                )

            # Parse Open Graph information from the HTML in case the oEmbed
            # response failed or is incomplete.
            og_from_html = parse_html_to_open_graph(tree, media_info.uri)

            # Compile the Open Graph response by using the scraped
            # information from the HTML and overlaying any information
            # from the oEmbed response.
            og = {**og_from_html, **og_from_oembed}

            await self._precache_image_url(user, media_info, og)
        else:
            og = {}

    elif oembed_url:
        # Handle the oEmbed information.
        og, author_name, expiration_ms = await self._handle_oembed_response(
            url, media_info, expiration_ms
        )
        await self._precache_image_url(user, media_info, og)

    else:
        logger.warning("Failed to find any OG data in %s", url)
        og = {}

    # If we don't have a title but we have author_name, copy it as
    # title
    if not og.get("og:title") and author_name:
        og["og:title"] = author_name

    # filter out any stupidly long values
    keys_to_remove = []
    for k, v in og.items():
        # values can be numeric as well as strings, hence the cast to str
        if len(k) > OG_TAG_NAME_MAXLEN or len(str(v)) > OG_TAG_VALUE_MAXLEN:
            logger.warning(
                "Pruning overlong tag %s from OG data", k[:OG_TAG_NAME_MAXLEN]
            )
            keys_to_remove.append(k)
    # Deleted in a second pass: a dict must not change size while iterated.
    for k in keys_to_remove:
        del og[k]

    logger.debug("Calculated OG for %s as %s", url, og)

    jsonog = json_encoder.encode(og)

    # Cap the amount of time to consider a response valid.
    expiration_ms = min(expiration_ms, ONE_DAY)

    # store OG in history-aware DB cache
    await self.store.store_url_cache(
        url,
        media_info.response_code,
        media_info.etag,
        media_info.created_ts_ms + expiration_ms,
        jsonog,
        media_info.filesystem_id,
        media_info.created_ts_ms,
    )

    return jsonog.encode("utf8")
|
2016-04-08 16:36:48 -04:00
|
|
|
|
2022-01-24 08:58:18 -05:00
|
|
|
async def _download_url(self, url: str, output_stream: BinaryIO) -> DownloadResult:
    """
    Fetches a remote URL and parses the headers.

    Args:
        url: The URL to fetch.
        output_stream: The stream to write the content to.

    Returns:
        A DownloadResult holding the media length, the URL downloaded, the
        HTTP response code, the media type, the downloaded file name, the
        number of milliseconds the result is valid for and the etag header.

    Raises:
        SynapseError: 502 on a DNS lookup failure, 500 on any other
            download failure; SynapseErrors raised by the client are
            passed through unchanged.
    """
    try:
        logger.debug("Trying to get preview for url '%s'", url)
        length, headers, uri, code = await self.client.get_file(
            url,
            output_stream=output_stream,
            max_size=self.max_spider_size,
            headers={"Accept-Language": self.url_preview_accept_language},
            is_allowed_content_type=_is_previewable,
        )
    except SynapseError:
        # Let these propagate untouched so the servlet handler reports a
        # SynapseError to the client rather than blank data or a 500.
        raise
    except DNSLookupError:
        # The lookup returned no results. Note that this is also what
        # happens when one of the resolved IP addresses is blacklisted.
        raise SynapseError(
            502,
            "DNS resolution failure during URL preview generation",
            Codes.UNKNOWN,
        )
    except Exception as e:
        # FIXME: pass through 404s and other error messages nicely
        logger.warning("Error downloading %s: %r", url, e)

        failure = traceback.format_exception_only(sys.exc_info()[0], e)
        raise SynapseError(
            500,
            "Failed to download content: %s" % (failure,),
            Codes.UNKNOWN,
        )

    # Fall back to a generic binary type when the server sent none.
    media_type = (
        headers[b"Content-Type"][0].decode("ascii")
        if b"Content-Type" in headers
        else "application/octet-stream"
    )

    download_name = get_filename_from_headers(headers)

    # FIXME: we should calculate a proper expiration based on the
    # Cache-Control and Expire headers. But for now, assume 1 hour.
    expires = ONE_HOUR

    if b"ETag" in headers:
        etag = headers[b"ETag"][0].decode("ascii")
    else:
        etag = None

    return DownloadResult(
        length=length,
        uri=uri,
        response_code=code,
        media_type=media_type,
        download_name=download_name,
        expires=expires,
        etag=etag,
    )
|
|
|
|
|
|
|
|
async def _parse_data_url(
    self, url: str, output_stream: BinaryIO
) -> DownloadResult:
    """
    Parses a data: URL.

    Args:
        url: The URL to parse.
        output_stream: The stream to write the content to.

    Returns:
        A DownloadResult holding the media length, the URL itself, a 200
        response code, the media type, no download name, the number of
        milliseconds the result is valid for and no etag.

    Raises:
        SynapseError: 500 if the URL cannot be opened, read or written out.
    """
    try:
        logger.debug("Trying to parse data url '%s'", url)
        with urlopen(url) as url_info:
            # TODO Can this be more efficient.
            payload = url_info.read()
            output_stream.write(payload)
    except Exception as e:
        logger.warning("Error parsing data: URL %s: %r", url, e)

        failure = traceback.format_exception_only(sys.exc_info()[0], e)
        raise SynapseError(
            500,
            "Failed to parse data URL: %s" % (failure,),
            Codes.UNKNOWN,
        )

    # Read back the length that has been written.
    written = output_stream.tell()
    # urlopen shoves the media-type from the data URL into the content type
    # header object.
    media_type = url_info.headers.get_content_type()

    return DownloadResult(
        length=written,
        uri=url,
        # If it was parsed, consider this a 200 OK.
        response_code=200,
        media_type=media_type,
        # Some features are not supported by data: URLs.
        download_name=None,
        expires=ONE_HOUR,
        etag=None,
    )
|
|
|
|
|
|
|
|
async def _handle_url(
    self, url: str, user: UserID, allow_data_urls: bool = False
) -> MediaInfo:
    """
    Fetches content from a URL and parses the result to generate a MediaInfo.

    It uses the media storage provider to persist the fetched content and
    stores the mapping into the database.

    Args:
        url: The URL to fetch.
        user: The user who has requested this URL.
        allow_data_urls: True if data URLs should be allowed.

    Returns:
        A MediaInfo object describing the fetched content.

    Raises:
        SynapseError: 500 if ``url`` is a data: URL and those are not allowed.
    """
    # TODO: we should probably honour robots.txt... except in practice
    # we're most likely being explicitly triggered by a human rather than a
    # bot, so are we really a robot?

    # File ids are "<today's ISO date>_<16 random characters>".
    today = datetime.date.today().isoformat()
    file_id = today + "_" + random_string(16)

    file_info = FileInfo(server_name=None, file_id=file_id, url_cache=True)

    with self.media_storage.store_into_file(file_info) as (stream, path, finish):
        if not url.startswith("data:"):
            download_result = await self._download_url(url, stream)
        elif allow_data_urls:
            download_result = await self._parse_data_url(url, stream)
        else:
            raise SynapseError(
                500, "Previewing of data: URLs is forbidden", Codes.UNKNOWN
            )

        await finish()

    try:
        time_now_ms = self.clock.time_msec()

        await self.store.store_local_media(
            media_id=file_id,
            media_type=download_result.media_type,
            time_now_ms=time_now_ms,
            upload_name=download_result.download_name,
            media_length=download_result.length,
            user_id=user,
            url_cache=url,
        )
    except Exception as e:
        logger.error("Error handling downloaded %s: %r", url, e)
        # TODO: we really ought to delete the downloaded file in this
        # case, since we won't have recorded it in the db, and will
        # therefore not expire it.
        raise

    return MediaInfo(
        media_type=download_result.media_type,
        media_length=download_result.length,
        download_name=download_result.download_name,
        created_ts_ms=time_now_ms,
        filesystem_id=file_id,
        filename=path,
        uri=download_result.uri,
        response_code=download_result.response_code,
        expires=download_result.expires,
        etag=download_result.etag,
    )
|
2016-01-24 18:47:27 -05:00
|
|
|
|
2021-09-21 12:09:57 -04:00
|
|
|
async def _precache_image_url(
    self, user: UserID, media_info: MediaInfo, og: JsonDict
) -> None:
    """
    Pre-cache the image (if one exists) for posterity

    Args:
        user: The user requesting the preview.
        media_info: The media being previewed.
        og: The Open Graph dictionary. This is modified with image information.
    """
    # Nothing to do without a (non-blank) image.
    if not og.get("og:image"):
        return

    # FIXME: it might be cleaner to use the same flow as the main /preview_url
    # request itself and benefit from the same caching etc. But for now we
    # just rely on the caching on the master request to speed things up.
    image_url = rebase_url(og["og:image"], media_info.uri)
    image_info = await self._handle_url(image_url, user, allow_data_urls=True)

    if not _is_media(image_info.media_type):
        # What came back is not media: drop the image from the preview.
        del og["og:image"]
        return

    # TODO: make sure we don't choke on white-on-transparent images
    file_id = image_info.filesystem_id
    dims = await self.media_repo._generate_thumbnails(
        None, file_id, file_id, image_info.media_type, url_cache=True
    )
    if dims:
        og["og:image:width"] = dims["width"]
        og["og:image:height"] = dims["height"]
    else:
        logger.warning("Couldn't get dims for %s", og["og:image"])

    # Point the preview at our locally stored copy of the image.
    og["og:image"] = f"mxc://{self.server_name}/{image_info.filesystem_id}"
    og["og:image:type"] = image_info.media_type
    og["matrix:image:size"] = image_info.media_length
|
|
|
|
|
2021-10-08 14:14:42 -04:00
|
|
|
async def _handle_oembed_response(
    self, url: str, media_info: MediaInfo, expiration_ms: int
) -> Tuple[JsonDict, Optional[str], int]:
    """
    Parse the downloaded oEmbed info.

    Args:
        url: The URL which is being previewed (not the one which was
            requested).
        media_info: The media being previewed.
        expiration_ms: The length of time, in milliseconds, the media is valid for.

    Returns:
        A tuple of:
            The Open Graph dictionary, if the oEmbed info can be parsed.
            The author name if it could be retrieved from oEmbed.
            The (possibly updated) length of time, in milliseconds, the media is valid for.
    """
    if _is_json(media_info.media_type):
        with open(media_info.filename, "rb") as oembed_file:
            raw = oembed_file.read()

        parsed = self._oembed.parse_oembed_response(url, raw)
        og_result = parsed.open_graph_result

        # Prefer the cache age from the oEmbed result, if one was given.
        if og_result and parsed.cache_age is not None:
            expiration_ms = parsed.cache_age

        return og_result, parsed.author_name, expiration_ms

    # JSON was not returned: there is nothing to parse.
    return {}, None, expiration_ms
|
2021-10-08 14:14:42 -04:00
|
|
|
|
2021-09-15 08:45:32 -04:00
|
|
|
def _start_expire_url_cache_data(self) -> Deferred:
    """Run the URL cache expiry job as a background process.

    Returns:
        Whatever ``run_as_background_process`` returns for the job.
    """
    job_name = "expire_url_cache_data"
    return run_as_background_process(job_name, self._expire_url_cache_data)
|
|
|
|
|
2021-01-15 10:57:37 -05:00
|
|
|
async def _expire_url_cache_data(self) -> None:
    """Clean up expired url cache content, media and thumbnails.

    The work is done in two passes:

    1. Every media ID returned by `store.get_expired_url_cache(now)` has its
       url cache file removed from disk; the IDs whose removal did not fail
       are then passed to `store.delete_url_cache`.
    2. Every media ID returned by `store.get_url_cache_media_before` for the
       cut-off `now - IMAGE_CACHE_EXPIRY_MS` has its url cache file and its
       thumbnail directory removed; the IDs for which neither removal failed
       are then passed to `store.delete_url_cache_media`.

    A file or directory that is already missing counts as successfully
    removed. Any other `OSError` is logged as a warning and that media ID is
    left out of the list passed to the store for that pass.

    Nothing is done while background database updates are still running.
    """

    assert self._worker_run_media_background_jobs

    now = self.clock.time_msec()

    logger.debug("Running url preview cache expiry")

    if not (await self.store.db_pool.updates.has_completed_background_updates()):
        logger.info("Still running DB updates; skipping expiry")
        return

    def try_remove_parent_dirs(dirs: Iterable[str]) -> None:
        """Attempt to remove the given chain of parent directories

        Args:
            dirs: The list of directory paths to delete, with children appearing
                before their parents.
        """
        # NB: the loop variable is deliberately not called `dir`, which would
        # shadow the builtin of the same name.
        for parent_dir in dirs:
            try:
                os.rmdir(parent_dir)
            except FileNotFoundError:
                # Already deleted, continue with deleting the rest
                pass
            except OSError as e:
                # Failed, skip deleting the rest of the parent dirs
                if e.errno != errno.ENOTEMPTY:
                    # A non-empty directory is the expected way for the chain
                    # to stop, so only other failures are worth a warning.
                    logger.warning(
                        "Failed to remove media directory: %r: %s", parent_dir, e
                    )
                break

    # First we delete expired url cache entries
    media_ids = await self.store.get_expired_url_cache(now)

    removed_media = []
    for media_id in media_ids:
        fname = self.filepaths.url_cache_filepath(media_id)
        try:
            os.remove(fname)
        except FileNotFoundError:
            pass  # If the path doesn't exist, meh
        except OSError as e:
            logger.warning("Failed to remove media: %r: %s", media_id, e)
            # Skip the append below so this ID is not reported as removed.
            continue

        removed_media.append(media_id)

        dirs = self.filepaths.url_cache_filepath_dirs_to_delete(media_id)
        try_remove_parent_dirs(dirs)

    await self.store.delete_url_cache(removed_media)

    if removed_media:
        logger.info("Deleted %d entries from url cache", len(removed_media))
    else:
        logger.debug("No entries removed from url cache")

    # Now we delete old images associated with the url cache.
    # These may be cached for a bit on the client (i.e., they
    # may have a room open with a preview url thing open).
    # So we wait a couple of days before deleting, just in case.
    expire_before = now - IMAGE_CACHE_EXPIRY_MS
    media_ids = await self.store.get_url_cache_media_before(expire_before)

    removed_media = []
    for media_id in media_ids:
        fname = self.filepaths.url_cache_filepath(media_id)
        try:
            os.remove(fname)
        except FileNotFoundError:
            pass  # If the path doesn't exist, meh
        except OSError as e:
            logger.warning("Failed to remove media: %r: %s", media_id, e)
            continue

        dirs = self.filepaths.url_cache_filepath_dirs_to_delete(media_id)
        try_remove_parent_dirs(dirs)

        thumbnail_dir = self.filepaths.url_cache_thumbnail_directory(media_id)
        try:
            shutil.rmtree(thumbnail_dir)
        except FileNotFoundError:
            pass  # If the path doesn't exist, meh
        except OSError as e:
            logger.warning("Failed to remove media: %r: %s", media_id, e)
            continue

        # Only reached when neither the file nor the thumbnail removal failed.
        removed_media.append(media_id)

        dirs = self.filepaths.url_cache_thumbnail_dirs_to_delete(media_id)
        # Note that one of the directories to be deleted has already been
        # removed by the `rmtree` above.
        try_remove_parent_dirs(dirs)

    await self.store.delete_url_cache_media(removed_media)

    if removed_media:
        logger.info("Deleted %d media from url cache", len(removed_media))
    else:
        logger.debug("No media removed from url cache")
|
2017-09-28 07:18:06 -04:00
|
|
|
|
2016-01-24 18:47:27 -05:00
|
|
|
|
2021-01-15 10:57:37 -05:00
|
|
|
def _is_media(content_type: str) -> bool:
    """Returns True if the content type, compared case-insensitively, begins
    with "image/" and False otherwise."""
    lowered = content_type.lower()
    return lowered.startswith("image/")
|
2016-08-16 09:53:18 -04:00
|
|
|
|
|
|
|
|
2021-01-15 10:57:37 -05:00
|
|
|
def _is_html(content_type: str) -> bool:
    """Returns True if the content type, compared case-insensitively, begins
    with "text/html" or "application/xhtml" and False otherwise."""
    html_prefixes = ("text/html", "application/xhtml")
    return content_type.lower().startswith(html_prefixes)
|
2016-08-04 11:08:32 -04:00
|
|
|
|
|
|
|
|
2021-09-21 12:09:57 -04:00
|
|
|
def _is_json(content_type: str) -> bool:
    """Returns True if the content type, compared case-insensitively, begins
    with "application/json" and False otherwise."""
    lowered = content_type.lower()
    return lowered.startswith("application/json")
|
2022-02-10 10:43:01 -05:00
|
|
|
|
|
|
|
|
|
|
|
def _is_previewable(content_type: str) -> bool:
    """Returns True for content types for which we will perform URL preview and False
    otherwise.

    A content type qualifies when any of `_is_html`, `_is_media` or `_is_json`
    accepts it; the checks are tried in that order and stop at the first match.
    """
    checks = (_is_html, _is_media, _is_json)
    return any(check(content_type) for check in checks)
|