Autodiscover oEmbed endpoint from returned HTML (#10822)

Searches the returned HTML for an oEmbed endpoint using the
autodiscovery mechanism (`<link rel=...>`) and, if one is found,
requests that endpoint to generate the preview.
This commit is contained in:
Patrick Cloke 2021-10-08 14:14:42 -04:00 committed by GitHub
parent 593eeac19e
commit 1b112840d2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 222 additions and 53 deletions

View File

@ -0,0 +1 @@
Support autodiscovery of oEmbed previews.

View File

@ -96,6 +96,32 @@ class OEmbedProvider:
# No match.
return None
def autodiscover_from_html(self, tree: "etree.Element") -> Optional[str]:
    """
    Search an HTML document for oEmbed autodiscovery information.

    Link elements whose type is ``application/json+oembed`` are searched,
    first with ``rel="alternate"`` and then with ``rel="alternative"``.
    The first non-empty ``href`` found is returned.

    Args:
        tree: The parsed HTML body.

    Returns:
        The URL to use for oEmbed information, or None if no URL was found.
    """
    # "alternate" is searched first so that it always wins when both forms
    # are present. Some providers (e.g. Flickr) use "alternative" instead of
    # "alternate", so that is tried as a fallback.
    for rel in ("alternate", "alternative"):
        query = "//link[@rel='%s'][@type='application/json+oembed']" % (rel,)
        for tag in tree.xpath(query):
            # Skip link elements with a missing *or empty* href: an empty
            # string is not a usable endpoint, and returning it would hide
            # any later link element that does carry a real URL.
            href = tag.attrib.get("href")
            if href:
                return href

    return None
def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
"""
Parse the oEmbed response into an Open Graph response.

View File

@ -22,7 +22,7 @@ import re
import shutil
import sys
import traceback
from typing import TYPE_CHECKING, Dict, Generator, Iterable, Optional, Union
from typing import TYPE_CHECKING, Dict, Generator, Iterable, Optional, Tuple, Union
from urllib import parse as urlparse
import attr
@ -296,22 +296,32 @@ class PreviewUrlResource(DirectServeJsonResource):
body = file.read()
encoding = get_html_media_encoding(body, media_info.media_type)
og = decode_and_calc_og(body, media_info.uri, encoding)
tree = decode_body(body, encoding)
if tree is not None:
# Check if this HTML document points to oEmbed information and
# defer to that.
oembed_url = self._oembed.autodiscover_from_html(tree)
og = {}
if oembed_url:
oembed_info = await self._download_url(oembed_url, user)
og, expiration_ms = await self._handle_oembed_response(
url, oembed_info, expiration_ms
)
await self._precache_image_url(user, media_info, og)
# If there was no oEmbed URL (or oEmbed parsing failed), attempt
# to generate the Open Graph information from the HTML.
if not oembed_url or not og:
og = _calc_og(tree, media_info.uri)
elif oembed_url and _is_json(media_info.media_type):
# Handle an oEmbed response.
with open(media_info.filename, "rb") as file:
body = file.read()
oembed_response = self._oembed.parse_oembed_response(url, body)
og = oembed_response.open_graph_result
# Use the cache age from the oEmbed result, instead of the HTTP response.
if oembed_response.cache_age is not None:
expiration_ms = oembed_response.cache_age
await self._precache_image_url(user, media_info, og)
else:
og = {}
elif oembed_url:
# Handle the oEmbed information.
og, expiration_ms = await self._handle_oembed_response(
url, media_info, expiration_ms
)
await self._precache_image_url(user, media_info, og)
else:
@ -479,6 +489,39 @@ class PreviewUrlResource(DirectServeJsonResource):
else:
del og["og:image"]
async def _handle_oembed_response(
    self, url: str, media_info: MediaInfo, expiration_ms: int
) -> Tuple[JsonDict, int]:
    """
    Convert downloaded oEmbed information into an Open Graph result.

    Args:
        url: The URL which is being previewed (not the one which was
            requested).
        media_info: The media being previewed.
        expiration_ms: The length of time, in milliseconds, the media is valid for.

    Returns:
        A tuple of:
            The Open Graph dictionary, if the oEmbed info can be parsed.
            The (possibly updated) length of time, in milliseconds, the media is valid for.
    """
    if _is_json(media_info.media_type):
        # The downloaded response lives on disk; read it back in full.
        with open(media_info.filename, "rb") as oembed_file:
            raw_body = oembed_file.read()

        parsed = self._oembed.parse_oembed_response(url, raw_body)
        og = parsed.open_graph_result

        # The oEmbed cache age only replaces the incoming expiry when the
        # response produced a non-empty result and supplied a cache age.
        use_oembed_cache_age = bool(og) and parsed.cache_age is not None
        return og, (parsed.cache_age if use_oembed_cache_age else expiration_ms)

    # Anything other than JSON cannot be an oEmbed response we understand.
    return {}, expiration_ms
def _start_expire_url_cache_data(self) -> Deferred:
return run_as_background_process(
"expire_url_cache_data", self._expire_url_cache_data
@ -631,26 +674,22 @@ def get_html_media_encoding(body: bytes, content_type: str) -> str:
return "utf-8"
def decode_and_calc_og(
body: bytes, media_uri: str, request_encoding: Optional[str] = None
) -> JsonDict:
def decode_body(
body: bytes, request_encoding: Optional[str] = None
) -> Optional["etree.Element"]:
"""
Calculate metadata for an HTML document.
This uses lxml to parse the HTML document into the OG response. If errors
occur during processing of the document, an empty response is returned.
This uses lxml to parse the HTML document.
Args:
body: The HTML document, as bytes.
media_url: The URI used to download the body.
request_encoding: The character encoding of the body, as a string.
Returns:
The OG response as a dictionary.
The parsed HTML body, or None if an error occurred during processing.
"""
# If there's no body, nothing useful is going to be found.
if not body:
return {}
return None
from lxml import etree
@ -662,25 +701,22 @@ def decode_and_calc_og(
parser = etree.HTMLParser(recover=True, encoding="utf-8")
except Exception as e:
logger.warning("Unable to create HTML parser: %s" % (e,))
return {}
return None
def _attempt_calc_og(body_attempt: Union[bytes, str]) -> Dict[str, Optional[str]]:
# Attempt to parse the body. If this fails, log and return no metadata.
tree = etree.fromstring(body_attempt, parser)
# The data was successfully parsed, but no tree was found.
if tree is None:
return {}
return _calc_og(tree, media_uri)
def _attempt_decode_body(
body_attempt: Union[bytes, str]
) -> Optional["etree.Element"]:
# Attempt to parse the body. Returns None if the body was successfully
# parsed, but no tree was found.
return etree.fromstring(body_attempt, parser)
# Attempt to parse the body. If this fails, log and return no metadata.
try:
return _attempt_calc_og(body)
return _attempt_decode_body(body)
except UnicodeDecodeError:
# blindly try decoding the body as utf-8, which seems to fix
# the charset mismatches on https://google.com
return _attempt_calc_og(body.decode("utf-8", "ignore"))
return _attempt_decode_body(body.decode("utf-8", "ignore"))
def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]:

View File

@ -725,9 +725,107 @@ class URLPreviewTests(unittest.HomeserverTestCase):
},
)
def test_oembed_autodiscovery(self):
    """
    Autodiscovery works by finding the link in the HTML response and then requesting an oEmbed URL.

    1. Request a preview of a URL which is not known to the oEmbed code.
    2. It returns HTML including a link to an oEmbed preview.
    3. The oEmbed preview is requested and returns a URL for an image.
    4. The image is requested for thumbnailing.
    """

    def serve(connection_index, content_type, payload):
        """
        Answer the outbound connection at ``connection_index`` with a 200
        response carrying ``payload``, pump the reactor, and return the fake
        server that accumulated the bytes of the outgoing request.
        """
        client = self.reactor.tcpClients[connection_index][2].buildProtocol(None)
        server = AccumulatingProtocol()
        server.makeConnection(FakeTransport(client, self.reactor))
        client.makeConnection(FakeTransport(server, self.reactor))
        client.dataReceived(
            b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n" % (len(payload),)
            + b"Content-Type: "
            + content_type
            + b"\r\n\r\n"
            + payload
        )
        self.pump()
        return server

    # This is a little cheesy in that we use the www subdomain (which isn't in
    # the list of oEmbed patterns) to get a "raw" HTML response.
    for host in ("www.twitter.com", "publish.twitter.com", "cdn.twitter.com"):
        self.lookups[host] = [(IPv4Address, "10.1.2.3")]

    # The HTML document: nothing but a link element advertising the oEmbed
    # endpoint.
    html = (
        b'\n<link rel="alternate" type="application/json+oembed"\n'
        b'href="http://publish.twitter.com/oembed?url=http%3A%2F%2Fcdn.twitter.com%2Fmatrixdotorg%2Fstatus%2F12345&format=json"\n'
        b'title="matrixdotorg" />\n'
    )

    channel = self.make_request(
        "GET",
        "preview_url?url=http://www.twitter.com/matrixdotorg/status/12345",
        shorthand=False,
        await_result=False,
    )
    self.pump()

    # First connection: the page being previewed.
    serve(0, b'text/html; charset="utf8"', html)

    # The oEmbed response.
    oembed_content = json.dumps(
        {
            "version": "1.0",
            "type": "photo",
            "url": "http://cdn.twitter.com/matrixdotorg",
        }
    ).encode("utf-8")

    # Ensure a second request is made to the oEmbed URL.
    oembed_server = serve(1, b'application/json; charset="utf8"', oembed_content)

    # Ensure the URL is what was requested.
    self.assertIn(b"/oembed?", oembed_server.data)

    # Ensure a third request is made to the photo URL.
    image_server = serve(2, b"image/png", SMALL_PNG)

    # Ensure the URL is what was requested.
    self.assertIn(b"/matrixdotorg", image_server.data)

    self.assertEqual(channel.code, 200)
    body = channel.json_body
    self.assertEqual(
        body["og:url"], "http://www.twitter.com/matrixdotorg/status/12345"
    )
    self.assertTrue(body["og:image"].startswith("mxc://"))
    self.assertEqual(body["og:image:height"], 1)
    self.assertEqual(body["og:image:width"], 1)
    self.assertEqual(body["og:image:type"], "image/png")
def _download_image(self):
"""Downloads an image into the URL cache.
Returns:
A (host, media_id) tuple representing the MXC URI of the image.
"""

View File

@ -13,7 +13,8 @@
# limitations under the License.
from synapse.rest.media.v1.preview_url_resource import (
decode_and_calc_og,
_calc_og,
decode_body,
get_html_media_encoding,
summarize_paragraphs,
)
@ -158,7 +159,8 @@ class CalcOgTestCase(unittest.TestCase):
</html>
"""
og = decode_and_calc_og(html, "http://example.com/test.html")
tree = decode_body(html)
og = _calc_og(tree, "http://example.com/test.html")
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
@ -173,7 +175,8 @@ class CalcOgTestCase(unittest.TestCase):
</html>
"""
og = decode_and_calc_og(html, "http://example.com/test.html")
tree = decode_body(html)
og = _calc_og(tree, "http://example.com/test.html")
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
@ -191,7 +194,8 @@ class CalcOgTestCase(unittest.TestCase):
</html>
"""
og = decode_and_calc_og(html, "http://example.com/test.html")
tree = decode_body(html)
og = _calc_og(tree, "http://example.com/test.html")
self.assertEqual(
og,
@ -212,7 +216,8 @@ class CalcOgTestCase(unittest.TestCase):
</html>
"""
og = decode_and_calc_og(html, "http://example.com/test.html")
tree = decode_body(html)
og = _calc_og(tree, "http://example.com/test.html")
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
@ -225,7 +230,8 @@ class CalcOgTestCase(unittest.TestCase):
</html>
"""
og = decode_and_calc_og(html, "http://example.com/test.html")
tree = decode_body(html)
og = _calc_og(tree, "http://example.com/test.html")
self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
@ -239,7 +245,8 @@ class CalcOgTestCase(unittest.TestCase):
</html>
"""
og = decode_and_calc_og(html, "http://example.com/test.html")
tree = decode_body(html)
og = _calc_og(tree, "http://example.com/test.html")
self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."})
@ -253,21 +260,22 @@ class CalcOgTestCase(unittest.TestCase):
</html>
"""
og = decode_and_calc_og(html, "http://example.com/test.html")
tree = decode_body(html)
og = _calc_og(tree, "http://example.com/test.html")
self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
def test_empty(self):
"""Test a body with no data in it."""
html = b""
og = decode_and_calc_og(html, "http://example.com/test.html")
self.assertEqual(og, {})
tree = decode_body(html)
self.assertIsNone(tree)
def test_no_tree(self):
"""A valid body with no tree in it."""
html = b"\x00"
og = decode_and_calc_og(html, "http://example.com/test.html")
self.assertEqual(og, {})
tree = decode_body(html)
self.assertIsNone(tree)
def test_invalid_encoding(self):
"""An invalid character encoding should be ignored and treated as UTF-8, if possible."""
@ -279,9 +287,8 @@ class CalcOgTestCase(unittest.TestCase):
</body>
</html>
"""
og = decode_and_calc_og(
html, "http://example.com/test.html", "invalid-encoding"
)
tree = decode_body(html, "invalid-encoding")
og = _calc_og(tree, "http://example.com/test.html")
self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
def test_invalid_encoding2(self):
@ -295,7 +302,8 @@ class CalcOgTestCase(unittest.TestCase):
</body>
</html>
"""
og = decode_and_calc_og(html, "http://example.com/test.html")
tree = decode_body(html)
og = _calc_og(tree, "http://example.com/test.html")
self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})