Support oEmbed for media previews. (#7920)

Fixes previews of Twitter URLs by using their oEmbed endpoint to grab content.
This commit is contained in:
Patrick Cloke 2020-07-27 07:50:44 -04:00 committed by GitHub
parent b975fa2e99
commit 3fc8fdd150
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 355 additions and 53 deletions

1
changelog.d/7920.feature Normal file
View File

@ -0,0 +1 @@
Support oEmbed for media previews.

View File

@ -26,6 +26,7 @@ import traceback
from typing import Dict, Optional from typing import Dict, Optional
from urllib import parse as urlparse from urllib import parse as urlparse
import attr
from canonicaljson import json from canonicaljson import json
from twisted.internet import defer from twisted.internet import defer
@ -56,6 +57,65 @@ _content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)
OG_TAG_NAME_MAXLEN = 50 OG_TAG_NAME_MAXLEN = 50
OG_TAG_VALUE_MAXLEN = 1000 OG_TAG_VALUE_MAXLEN = 1000
ONE_HOUR = 60 * 60 * 1000
# A map of oEmbed API endpoints to the URL globs that each endpoint can
# provide content for.
_oembed_globs = {
    # Twitter.
    "https://publish.twitter.com/oembed": [
        "https://twitter.com/*/status/*",
        "https://*.twitter.com/*/status/*",
        "https://twitter.com/*/moments/*",
        "https://*.twitter.com/*/moments/*",
        # Include the HTTP versions too.
        "http://twitter.com/*/status/*",
        "http://*.twitter.com/*/status/*",
        "http://twitter.com/*/moments/*",
        "http://*.twitter.com/*/moments/*",
    ],
}


def _glob_to_pattern(glob: str):
    """
    Convert a URL glob into a compiled regular expression.

    The rules followed are slightly different for the domain portion vs. the
    rest of the URL:

    1. The scheme must be one of HTTP / HTTPS (and have no globs).
    2. The domain can have globs, but a glob is limited to characters that
       can reasonably be a domain part.
       TODO: This does not attempt to handle Unicode domain names.
    3. In every other part a glob matches one or more arbitrary characters.

    Params:
        glob: The URL glob, using "*" as the wildcard.

    Returns:
        The compiled pattern; use fullmatch() to test a URL against it.

    Raises:
        ValueError if the scheme of the glob is not http or https.
    """
    results = urlparse.urlparse(glob)

    # Ensure the scheme does not have wildcards (and is a sane scheme).
    if results.scheme not in {"http", "https"}:
        raise ValueError("Insecure oEmbed glob scheme: %s" % (results.scheme,))

    netloc_pattern = re.escape(results.netloc).replace("\\*", "[a-zA-Z0-9_-]+")

    # Join everything after the domain back together *before* escaping it.
    # Escaping the parts individually and joining them afterwards would put
    # the "?", ";" and "#" delimiters into the expression unescaped, where
    # "?" is a quantifier rather than a literal character.
    remainder = urlparse.urlunparse(["", ""] + list(results[2:]))
    remainder_pattern = re.escape(remainder).replace("\\*", ".+")

    return re.compile(
        "%s://%s%s" % (results.scheme, netloc_pattern, remainder_pattern)
    )


# Convert the globs to regular expressions, mapping each compiled pattern to
# the API endpoint that should be queried for URLs which match it.
_oembed_patterns = {}
for endpoint, globs in _oembed_globs.items():
    for glob in globs:
        _oembed_patterns[_glob_to_pattern(glob)] = endpoint
@attr.s
class OEmbedResult:
    """The parts of an oEmbed response that are used to build a URL preview."""

    # Either HTML content or URL must be provided.
    html = attr.ib(type=Optional[str])
    url = attr.ib(type=Optional[str])
    title = attr.ib(type=Optional[str])
    # Number of seconds to cache the content, or None when the response did
    # not include a cache age.
    # NOTE(review): _download_url falls back to ONE_HOUR (a millisecond value)
    # when this is unset, so the two appear to use different units -- confirm
    # and convert at the point of use.
    cache_age = attr.ib(type=Optional[int])
class OEmbedError(Exception):
    """Raised when oEmbed metadata could not be fetched or interpreted."""
class PreviewUrlResource(DirectServeJsonResource): class PreviewUrlResource(DirectServeJsonResource):
isLeaf = True isLeaf = True
@ -99,7 +159,7 @@ class PreviewUrlResource(DirectServeJsonResource):
cache_name="url_previews", cache_name="url_previews",
clock=self.clock, clock=self.clock,
# don't spider URLs more often than once an hour # don't spider URLs more often than once an hour
expiry_ms=60 * 60 * 1000, expiry_ms=ONE_HOUR,
) )
if self._worker_run_media_background_jobs: if self._worker_run_media_background_jobs:
@ -310,6 +370,87 @@ class PreviewUrlResource(DirectServeJsonResource):
return jsonog.encode("utf8") return jsonog.encode("utf8")
def _get_oembed_url(self, url: str) -> Optional[str]:
    """
    Look up the oEmbed API endpoint (if any) which serves the given URL.

    Params:
        url: The URL to check.

    Returns:
        The endpoint of the first known pattern that matches the whole URL,
        or None if the original URL should be used.
    """
    matching_endpoints = (
        api_endpoint
        for compiled, api_endpoint in _oembed_patterns.items()
        if compiled.fullmatch(url)
    )
    # Falls through to None when no pattern matches.
    return next(matching_endpoints, None)
async def _get_oembed_content(self, endpoint: str, url: str) -> OEmbedResult:
    """
    Query an oEmbed endpoint about a URL and interpret the response.

    Params:
        endpoint: The oEmbed API endpoint.
        url: The URL to pass to the API.

    Returns:
        An object representing the metadata returned.

    Raises:
        OEmbedError if fetching or parsing of the oEmbed information fails.
    """
    try:
        logger.debug("Trying to get oEmbed content for url '%s'", url)
        # Note that only the JSON format is supported.
        # TODO Specify max height / width.
        response = await self.client.get_json(endpoint, args={"url": url})

        # Ensure there's a version of 1.0.
        version = response.get("version")
        if version != "1.0":
            raise OEmbedError("Invalid version: %s" % (version,))

        kind = response.get("type")

        # A truthy cache age is coerced to an int; a missing (or otherwise
        # falsy) one is passed through untouched.
        max_age = response.get("cache_age")
        if max_age:
            max_age = int(max_age)

        parsed = OEmbedResult(None, None, response.get("title"), max_age)

        if kind == "rich":
            # HTML content.
            parsed.html = response.get("html")
        elif kind == "photo":
            parsed.url = response.get("url")
        elif "thumbnail_url" in response:
            # TODO Handle link and video types.
            parsed.url = response.get("thumbnail_url")
        else:
            raise OEmbedError("Incompatible oEmbed information.")

        return parsed

    except OEmbedError as err:
        # Trap OEmbedErrors first so we can directly re-raise them.
        logger.warning("Error parsing oEmbed metadata from %s: %r", url, err)
        raise

    except Exception as err:
        # Trap any exception and let the code follow as usual.
        # FIXME: pass through 404s and other error messages nicely
        logger.warning("Error downloading oEmbed metadata from %s: %r", url, err)
        raise OEmbedError() from err
async def _download_url(self, url, user): async def _download_url(self, url, user):
# TODO: we should probably honour robots.txt... except in practice # TODO: we should probably honour robots.txt... except in practice
# we're most likely being explicitly triggered by a human rather than a # we're most likely being explicitly triggered by a human rather than a
@ -319,11 +460,27 @@ class PreviewUrlResource(DirectServeJsonResource):
file_info = FileInfo(server_name=None, file_id=file_id, url_cache=True) file_info = FileInfo(server_name=None, file_id=file_id, url_cache=True)
# If this URL can be accessed via oEmbed, use that instead.
url_to_download = url
oembed_url = self._get_oembed_url(url)
if oembed_url:
# The result might be a new URL to download, or it might be HTML content.
try:
oembed_result = await self._get_oembed_content(oembed_url, url)
if oembed_result.url:
url_to_download = oembed_result.url
elif oembed_result.html:
url_to_download = None
except OEmbedError:
# If an error occurs, try doing a normal preview.
pass
if url_to_download:
with self.media_storage.store_into_file(file_info) as (f, fname, finish): with self.media_storage.store_into_file(file_info) as (f, fname, finish):
try: try:
logger.debug("Trying to get preview for url '%s'", url) logger.debug("Trying to get preview for url '%s'", url_to_download)
length, headers, uri, code = await self.client.get_file( length, headers, uri, code = await self.client.get_file(
url, url_to_download,
output_stream=f, output_stream=f,
max_size=self.max_spider_size, max_size=self.max_spider_size,
headers={"Accept-Language": self.url_preview_accept_language}, headers={"Accept-Language": self.url_preview_accept_language},
@ -344,7 +501,7 @@ class PreviewUrlResource(DirectServeJsonResource):
) )
except Exception as e: except Exception as e:
# FIXME: pass through 404s and other error messages nicely # FIXME: pass through 404s and other error messages nicely
logger.warning("Error downloading %s: %r", url, e) logger.warning("Error downloading %s: %r", url_to_download, e)
raise SynapseError( raise SynapseError(
500, 500,
@ -354,19 +511,39 @@ class PreviewUrlResource(DirectServeJsonResource):
) )
await finish() await finish()
try:
if b"Content-Type" in headers: if b"Content-Type" in headers:
media_type = headers[b"Content-Type"][0].decode("ascii") media_type = headers[b"Content-Type"][0].decode("ascii")
else: else:
media_type = "application/octet-stream" media_type = "application/octet-stream"
time_now_ms = self.clock.time_msec()
download_name = get_filename_from_headers(headers) download_name = get_filename_from_headers(headers)
# FIXME: we should calculate a proper expiration based on the
# Cache-Control and Expire headers. But for now, assume 1 hour.
expires = ONE_HOUR
etag = headers["ETag"][0] if "ETag" in headers else None
else:
html_bytes = oembed_result.html.encode("utf-8") # type: ignore
with self.media_storage.store_into_file(file_info) as (f, fname, finish):
f.write(html_bytes)
await finish()
media_type = "text/html"
download_name = oembed_result.title
length = len(html_bytes)
# If a specific cache age was not given, assume 1 hour.
expires = oembed_result.cache_age or ONE_HOUR
uri = oembed_url
code = 200
etag = None
try:
time_now_ms = self.clock.time_msec()
await self.store.store_local_media( await self.store.store_local_media(
media_id=file_id, media_id=file_id,
media_type=media_type, media_type=media_type,
time_now_ms=self.clock.time_msec(), time_now_ms=time_now_ms,
upload_name=download_name, upload_name=download_name,
media_length=length, media_length=length,
user_id=user, user_id=user,
@ -389,10 +566,8 @@ class PreviewUrlResource(DirectServeJsonResource):
"filename": fname, "filename": fname,
"uri": uri, "uri": uri,
"response_code": code, "response_code": code,
# FIXME: we should calculate a proper expiration based on the "expires": expires,
# Cache-Control and Expire headers. But for now, assume 1 hour. "etag": etag,
"expires": 60 * 60 * 1000,
"etag": headers["ETag"][0] if "ETag" in headers else None,
} }
def _start_expire_url_cache_data(self): def _start_expire_url_cache_data(self):
@ -449,7 +624,7 @@ class PreviewUrlResource(DirectServeJsonResource):
# These may be cached for a bit on the client (i.e., they # These may be cached for a bit on the client (i.e., they
# may have a room open with a preview url thing open). # may have a room open with a preview url thing open).
# So we wait a couple of days before deleting, just in case. # So we wait a couple of days before deleting, just in case.
expire_before = now - 2 * 24 * 60 * 60 * 1000 expire_before = now - 2 * 24 * ONE_HOUR
media_ids = await self.store.get_url_cache_media_before(expire_before) media_ids = await self.store.get_url_cache_media_before(expire_before)
removed_media = [] removed_media = []

View File

@ -12,8 +12,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import json
import os import os
import re
from mock import patch
import attr import attr
@ -131,7 +134,7 @@ class URLPreviewTests(unittest.HomeserverTestCase):
self.reactor.nameResolver = Resolver() self.reactor.nameResolver = Resolver()
def test_cache_returns_correct_type(self): def test_cache_returns_correct_type(self):
self.lookups["matrix.org"] = [(IPv4Address, "8.8.8.8")] self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
request, channel = self.make_request( request, channel = self.make_request(
"GET", "url_preview?url=http://matrix.org", shorthand=False "GET", "url_preview?url=http://matrix.org", shorthand=False
@ -187,7 +190,7 @@ class URLPreviewTests(unittest.HomeserverTestCase):
) )
def test_non_ascii_preview_httpequiv(self): def test_non_ascii_preview_httpequiv(self):
self.lookups["matrix.org"] = [(IPv4Address, "8.8.8.8")] self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
end_content = ( end_content = (
b"<html><head>" b"<html><head>"
@ -221,7 +224,7 @@ class URLPreviewTests(unittest.HomeserverTestCase):
self.assertEqual(channel.json_body["og:title"], "\u0434\u043a\u0430") self.assertEqual(channel.json_body["og:title"], "\u0434\u043a\u0430")
def test_non_ascii_preview_content_type(self): def test_non_ascii_preview_content_type(self):
self.lookups["matrix.org"] = [(IPv4Address, "8.8.8.8")] self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
end_content = ( end_content = (
b"<html><head>" b"<html><head>"
@ -254,7 +257,7 @@ class URLPreviewTests(unittest.HomeserverTestCase):
self.assertEqual(channel.json_body["og:title"], "\u0434\u043a\u0430") self.assertEqual(channel.json_body["og:title"], "\u0434\u043a\u0430")
def test_overlong_title(self): def test_overlong_title(self):
self.lookups["matrix.org"] = [(IPv4Address, "8.8.8.8")] self.lookups["matrix.org"] = [(IPv4Address, "10.1.2.3")]
end_content = ( end_content = (
b"<html><head>" b"<html><head>"
@ -292,7 +295,7 @@ class URLPreviewTests(unittest.HomeserverTestCase):
""" """
IP addresses can be previewed directly. IP addresses can be previewed directly.
""" """
self.lookups["example.com"] = [(IPv4Address, "8.8.8.8")] self.lookups["example.com"] = [(IPv4Address, "10.1.2.3")]
request, channel = self.make_request( request, channel = self.make_request(
"GET", "url_preview?url=http://example.com", shorthand=False "GET", "url_preview?url=http://example.com", shorthand=False
@ -439,7 +442,7 @@ class URLPreviewTests(unittest.HomeserverTestCase):
# Hardcode the URL resolving to the IP we want. # Hardcode the URL resolving to the IP we want.
self.lookups["example.com"] = [ self.lookups["example.com"] = [
(IPv4Address, "1.1.1.2"), (IPv4Address, "1.1.1.2"),
(IPv4Address, "8.8.8.8"), (IPv4Address, "10.1.2.3"),
] ]
request, channel = self.make_request( request, channel = self.make_request(
@ -518,7 +521,7 @@ class URLPreviewTests(unittest.HomeserverTestCase):
""" """
Accept-Language header is sent to the remote server Accept-Language header is sent to the remote server
""" """
self.lookups["example.com"] = [(IPv4Address, "8.8.8.8")] self.lookups["example.com"] = [(IPv4Address, "10.1.2.3")]
# Build and make a request to the server # Build and make a request to the server
request, channel = self.make_request( request, channel = self.make_request(
@ -562,3 +565,126 @@ class URLPreviewTests(unittest.HomeserverTestCase):
), ),
server.data, server.data,
) )
def test_oembed_photo(self):
    """A 'photo' oEmbed response redirects the preview to the URL it names."""
    # Route the HTTP version to an HTTP endpoint so that the tests work.
    http_patterns = {
        re.compile(
            r"http://twitter\.com/.+/status/.+"
        ): "http://publish.twitter.com/oembed",
    }
    with patch.dict(
        "synapse.rest.media.v1.preview_url_resource._oembed_patterns",
        http_patterns,
        clear=True,
    ):
        # Both the oEmbed endpoint and the host of the URL it returns are
        # given a DNS entry.
        for hostname in ("publish.twitter.com", "cdn.twitter.com"):
            self.lookups[hostname] = [(IPv4Address, "10.1.2.3")]

        # The canned oEmbed API response.
        oembed_content = json.dumps(
            {
                "version": "1.0",
                "type": "photo",
                "url": "http://cdn.twitter.com/matrixdotorg",
            }
        ).encode("utf-8")
        oembed_response = (
            b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
            b'Content-Type: application/json; charset="utf8"\r\n\r\n'
        ) % (len(oembed_content),) + oembed_content

        # The canned page served for the URL named in the oEmbed response.
        end_content = (
            b"<html><head>"
            b"<title>Some Title</title>"
            b'<meta property="og:description" content="hi" />'
            b"</head></html>"
        )
        page_response = (
            b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
            b'Content-Type: text/html; charset="utf8"\r\n\r\n'
        ) % (len(end_content),) + end_content

        request, channel = self.make_request(
            "GET",
            "url_preview?url=http://twitter.com/matrixdotorg/status/12345",
            shorthand=False,
        )
        request.render(self.preview_url)
        self.pump()

        # Answer the first outbound connection with the oEmbed JSON.
        api_client = self.reactor.tcpClients[0][2].buildProtocol(None)
        api_server = AccumulatingProtocol()
        api_server.makeConnection(FakeTransport(api_client, self.reactor))
        api_client.makeConnection(FakeTransport(api_server, self.reactor))
        api_client.dataReceived(oembed_response)

        self.pump()

        # Answer the second outbound connection with the HTML page.
        page_client = self.reactor.tcpClients[1][2].buildProtocol(None)
        page_server = AccumulatingProtocol()
        page_server.makeConnection(FakeTransport(page_client, self.reactor))
        page_client.makeConnection(FakeTransport(page_server, self.reactor))
        page_client.dataReceived(page_response)

        self.pump()

        self.assertEqual(channel.code, 200)
        self.assertEqual(
            channel.json_body, {"og:title": "Some Title", "og:description": "hi"}
        )
def test_oembed_rich(self):
    """A 'rich' oEmbed response supplies the HTML that the preview is built from."""
    # Route the HTTP version to an HTTP endpoint so that the tests work.
    http_patterns = {
        re.compile(
            r"http://twitter\.com/.+/status/.+"
        ): "http://publish.twitter.com/oembed",
    }
    with patch.dict(
        "synapse.rest.media.v1.preview_url_resource._oembed_patterns",
        http_patterns,
        clear=True,
    ):
        self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.3")]

        # The canned oEmbed API response, carrying the HTML inline.
        end_content = json.dumps(
            {
                "version": "1.0",
                "type": "rich",
                "html": "<div>Content Preview</div>",
            }
        ).encode("utf-8")
        oembed_response = (
            b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
            b'Content-Type: application/json; charset="utf8"\r\n\r\n'
        ) % (len(end_content),) + end_content

        request, channel = self.make_request(
            "GET",
            "url_preview?url=http://twitter.com/matrixdotorg/status/12345",
            shorthand=False,
        )
        request.render(self.preview_url)
        self.pump()

        # Answer the first outbound connection with the oEmbed JSON.
        api_client = self.reactor.tcpClients[0][2].buildProtocol(None)
        api_server = AccumulatingProtocol()
        api_server.makeConnection(FakeTransport(api_client, self.reactor))
        api_client.makeConnection(FakeTransport(api_server, self.reactor))
        api_client.dataReceived(oembed_response)

        self.pump()

        self.assertEqual(channel.code, 200)
        self.assertEqual(
            channel.json_body,
            {"og:title": None, "og:description": "Content Preview"},
        )