Autodiscover oEmbed endpoint from returned HTML (#10822)
Searches the returned HTML for an oEmbed endpoint using the autodiscovery mechanism (`<link rel=...>`) and requests it to generate the preview.
parent 593eeac19e
commit 1b112840d2
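For orientation, the sketch below illustrates the general autodiscovery flow this commit wires into URL previews: parse the returned HTML, look for a `<link rel="alternate" type="application/json+oembed">` element (some providers use `rel="alternative"`), and fetch the referenced JSON document. This is a minimal, hypothetical sketch rather than the Synapse implementation; the `requests` calls and the `autodiscover_oembed_url`/`fetch_preview` helpers are assumptions for illustration only.

```python
# Minimal, hypothetical sketch of oEmbed autodiscovery (not the Synapse code itself).
from typing import Optional

import requests  # assumed available for this illustration
from lxml import etree


def autodiscover_oembed_url(html: bytes) -> Optional[str]:
    """Return the oEmbed endpoint advertised by an HTML document, if any."""
    if not html:
        return None
    parser = etree.HTMLParser(recover=True, encoding="utf-8")
    tree = etree.fromstring(html, parser)
    if tree is None:
        return None
    # Providers advertise the endpoint with
    # <link rel="alternate" type="application/json+oembed" href="...">;
    # some (e.g. Flickr) use rel="alternative" instead.
    for rel in ("alternate", "alternative"):
        for tag in tree.xpath(
            "//link[@rel='%s'][@type='application/json+oembed']" % rel
        ):
            if "href" in tag.attrib:
                return tag.attrib["href"]
    return None


def fetch_preview(page_url: str) -> dict:
    """Fetch a page and, if it advertises an oEmbed endpoint, prefer that for the preview."""
    html = requests.get(page_url, timeout=10).content
    oembed_url = autodiscover_oembed_url(html)
    if oembed_url is None:
        return {}  # fall back to scraping Open Graph tags from the HTML itself
    return requests.get(oembed_url, timeout=10).json()
```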
changelog.d/10822.feature (new file)

@@ -0,0 +1 @@
+Support autodiscovery of oEmbed previews.
@@ -96,6 +96,32 @@ class OEmbedProvider:
         # No match.
         return None
 
+    def autodiscover_from_html(self, tree: "etree.Element") -> Optional[str]:
+        """
+        Search an HTML document for oEmbed autodiscovery information.
+
+        Args:
+            tree: The parsed HTML body.
+
+        Returns:
+            The URL to use for oEmbed information, or None if no URL was found.
+        """
+        # Search for link elements with the proper rel and type attributes.
+        for tag in tree.xpath(
+            "//link[@rel='alternate'][@type='application/json+oembed']"
+        ):
+            if "href" in tag.attrib:
+                return tag.attrib["href"]
+
+        # Some providers (e.g. Flickr) use alternative instead of alternate.
+        for tag in tree.xpath(
+            "//link[@rel='alternative'][@type='application/json+oembed']"
+        ):
+            if "href" in tag.attrib:
+                return tag.attrib["href"]
+
+        return None
+
     def parse_oembed_response(self, url: str, raw_body: bytes) -> OEmbedResult:
         """
         Parse the oEmbed response into an Open Graph response.
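As a quick standalone illustration of the XPath query used by the new `autodiscover_from_html` method above (a sketch only, assuming `lxml` is installed; the example HTML and URLs are made up and not part of the commit):

```python
# Sketch only: exercises the same kind of XPath query outside Synapse.
from lxml import etree

html = b"""
<html>
  <head>
    <link rel="alternate" type="application/json+oembed"
          href="https://publish.example.com/oembed?url=https%3A%2F%2Fexample.com%2Fpost&format=json"
          title="example" />
  </head>
  <body>A post.</body>
</html>
"""

parser = etree.HTMLParser(recover=True, encoding="utf-8")
tree = etree.fromstring(html, parser)

hrefs = [
    tag.attrib["href"]
    for tag in tree.xpath("//link[@rel='alternate'][@type='application/json+oembed']")
    if "href" in tag.attrib
]
print(hrefs)  # one element: the provider's oEmbed endpoint URL
```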
@@ -22,7 +22,7 @@ import re
 import shutil
 import sys
 import traceback
-from typing import TYPE_CHECKING, Dict, Generator, Iterable, Optional, Union
+from typing import TYPE_CHECKING, Dict, Generator, Iterable, Optional, Tuple, Union
 from urllib import parse as urlparse
 
 import attr
@@ -296,22 +296,32 @@ class PreviewUrlResource(DirectServeJsonResource):
                 body = file.read()
 
             encoding = get_html_media_encoding(body, media_info.media_type)
-            og = decode_and_calc_og(body, media_info.uri, encoding)
-
-            await self._precache_image_url(user, media_info, og)
-
-        elif oembed_url and _is_json(media_info.media_type):
-            # Handle an oEmbed response.
-            with open(media_info.filename, "rb") as file:
-                body = file.read()
-
-            oembed_response = self._oembed.parse_oembed_response(url, body)
-            og = oembed_response.open_graph_result
-
-            # Use the cache age from the oEmbed result, instead of the HTTP response.
-            if oembed_response.cache_age is not None:
-                expiration_ms = oembed_response.cache_age
-
+            tree = decode_body(body, encoding)
+            if tree is not None:
+                # Check if this HTML document points to oEmbed information and
+                # defer to that.
+                oembed_url = self._oembed.autodiscover_from_html(tree)
+                og = {}
+                if oembed_url:
+                    oembed_info = await self._download_url(oembed_url, user)
+                    og, expiration_ms = await self._handle_oembed_response(
+                        url, oembed_info, expiration_ms
+                    )
+
+                # If there was no oEmbed URL (or oEmbed parsing failed), attempt
+                # to generate the Open Graph information from the HTML.
+                if not oembed_url or not og:
+                    og = _calc_og(tree, media_info.uri)
+
+                await self._precache_image_url(user, media_info, og)
+            else:
+                og = {}
+
+        elif oembed_url:
+            # Handle the oEmbed information.
+            og, expiration_ms = await self._handle_oembed_response(
+                url, media_info, expiration_ms
+            )
             await self._precache_image_url(user, media_info, og)
 
         else:
@@ -479,6 +489,39 @@ class PreviewUrlResource(DirectServeJsonResource):
         else:
             del og["og:image"]
 
+    async def _handle_oembed_response(
+        self, url: str, media_info: MediaInfo, expiration_ms: int
+    ) -> Tuple[JsonDict, int]:
+        """
+        Parse the downloaded oEmbed info.
+
+        Args:
+            url: The URL which is being previewed (not the one which was
+                requested).
+            media_info: The media being previewed.
+            expiration_ms: The length of time, in milliseconds, the media is valid for.
+
+        Returns:
+            A tuple of:
+                The Open Graph dictionary, if the oEmbed info can be parsed.
+                The (possibly updated) length of time, in milliseconds, the media is valid for.
+        """
+        # If JSON was not returned, there's nothing to do.
+        if not _is_json(media_info.media_type):
+            return {}, expiration_ms
+
+        with open(media_info.filename, "rb") as file:
+            body = file.read()
+
+        oembed_response = self._oembed.parse_oembed_response(url, body)
+        open_graph_result = oembed_response.open_graph_result
+
+        # Use the cache age from the oEmbed result, if one was given.
+        if open_graph_result and oembed_response.cache_age is not None:
+            expiration_ms = oembed_response.cache_age
+
+        return open_graph_result, expiration_ms
+
     def _start_expire_url_cache_data(self) -> Deferred:
         return run_as_background_process(
             "expire_url_cache_data", self._expire_url_cache_data
@@ -631,26 +674,22 @@ def get_html_media_encoding(body: bytes, content_type: str) -> str:
     return "utf-8"
 
 
-def decode_and_calc_og(
-    body: bytes, media_uri: str, request_encoding: Optional[str] = None
-) -> JsonDict:
+def decode_body(
+    body: bytes, request_encoding: Optional[str] = None
+) -> Optional["etree.Element"]:
     """
-    Calculate metadata for an HTML document.
-
-    This uses lxml to parse the HTML document into the OG response. If errors
-    occur during processing of the document, an empty response is returned.
+    This uses lxml to parse the HTML document.
 
     Args:
         body: The HTML document, as bytes.
-        media_url: The URI used to download the body.
         request_encoding: The character encoding of the body, as a string.
 
     Returns:
-        The OG response as a dictionary.
+        The parsed HTML body, or None if an error occurred during processed.
     """
     # If there's no body, nothing useful is going to be found.
     if not body:
-        return {}
+        return None
 
     from lxml import etree
 
@@ -662,25 +701,22 @@ def decode_and_calc_og(
         parser = etree.HTMLParser(recover=True, encoding="utf-8")
     except Exception as e:
         logger.warning("Unable to create HTML parser: %s" % (e,))
-        return {}
+        return None
 
-    def _attempt_calc_og(body_attempt: Union[bytes, str]) -> Dict[str, Optional[str]]:
-        # Attempt to parse the body. If this fails, log and return no metadata.
-        tree = etree.fromstring(body_attempt, parser)
-
-        # The data was successfully parsed, but no tree was found.
-        if tree is None:
-            return {}
-
-        return _calc_og(tree, media_uri)
+    def _attempt_decode_body(
+        body_attempt: Union[bytes, str]
+    ) -> Optional["etree.Element"]:
+        # Attempt to parse the body. Returns None if the body was successfully
+        # parsed, but no tree was found.
+        return etree.fromstring(body_attempt, parser)
 
     # Attempt to parse the body. If this fails, log and return no metadata.
     try:
-        return _attempt_calc_og(body)
+        return _attempt_decode_body(body)
     except UnicodeDecodeError:
         # blindly try decoding the body as utf-8, which seems to fix
         # the charset mismatches on https://google.com
-        return _attempt_calc_og(body.decode("utf-8", "ignore"))
+        return _attempt_decode_body(body.decode("utf-8", "ignore"))
 
 
 def _calc_og(tree: "etree.Element", media_uri: str) -> Dict[str, Optional[str]]:
@@ -725,9 +725,107 @@ class URLPreviewTests(unittest.HomeserverTestCase):
             },
         )
 
+    def test_oembed_autodiscovery(self):
+        """
+        Autodiscovery works by finding the link in the HTML response and then requesting an oEmbed URL.
+        1. Request a preview of a URL which is not known to the oEmbed code.
+        2. It returns HTML including a link to an oEmbed preview.
+        3. The oEmbed preview is requested and returns a URL for an image.
+        4. The image is requested for thumbnailing.
+        """
+        # This is a little cheesy in that we use the www subdomain (which isn't the
+        # list of oEmbed patterns) to get "raw" HTML response.
+        self.lookups["www.twitter.com"] = [(IPv4Address, "10.1.2.3")]
+        self.lookups["publish.twitter.com"] = [(IPv4Address, "10.1.2.3")]
+        self.lookups["cdn.twitter.com"] = [(IPv4Address, "10.1.2.3")]
+
+        result = b"""
+        <link rel="alternate" type="application/json+oembed"
+            href="http://publish.twitter.com/oembed?url=http%3A%2F%2Fcdn.twitter.com%2Fmatrixdotorg%2Fstatus%2F12345&format=json"
+            title="matrixdotorg" />
+        """
+
+        channel = self.make_request(
+            "GET",
+            "preview_url?url=http://www.twitter.com/matrixdotorg/status/12345",
+            shorthand=False,
+            await_result=False,
+        )
+        self.pump()
+
+        client = self.reactor.tcpClients[0][2].buildProtocol(None)
+        server = AccumulatingProtocol()
+        server.makeConnection(FakeTransport(client, self.reactor))
+        client.makeConnection(FakeTransport(server, self.reactor))
+        client.dataReceived(
+            (
+                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
+                b'Content-Type: text/html; charset="utf8"\r\n\r\n'
+            )
+            % (len(result),)
+            + result
+        )
+
+        self.pump()
+
+        # The oEmbed response.
+        result2 = {
+            "version": "1.0",
+            "type": "photo",
+            "url": "http://cdn.twitter.com/matrixdotorg",
+        }
+        oembed_content = json.dumps(result2).encode("utf-8")
+
+        # Ensure a second request is made to the oEmbed URL.
+        client = self.reactor.tcpClients[1][2].buildProtocol(None)
+        server = AccumulatingProtocol()
+        server.makeConnection(FakeTransport(client, self.reactor))
+        client.makeConnection(FakeTransport(server, self.reactor))
+        client.dataReceived(
+            (
+                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
+                b'Content-Type: application/json; charset="utf8"\r\n\r\n'
+            )
+            % (len(oembed_content),)
+            + oembed_content
+        )
+
+        self.pump()
+
+        # Ensure the URL is what was requested.
+        self.assertIn(b"/oembed?", server.data)
+
+        # Ensure a third request is made to the photo URL.
+        client = self.reactor.tcpClients[2][2].buildProtocol(None)
+        server = AccumulatingProtocol()
+        server.makeConnection(FakeTransport(client, self.reactor))
+        client.makeConnection(FakeTransport(server, self.reactor))
+        client.dataReceived(
+            (
+                b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
+                b"Content-Type: image/png\r\n\r\n"
+            )
+            % (len(SMALL_PNG),)
+            + SMALL_PNG
+        )
+
+        self.pump()
+
+        # Ensure the URL is what was requested.
+        self.assertIn(b"/matrixdotorg", server.data)
+
+        self.assertEqual(channel.code, 200)
+        body = channel.json_body
+        self.assertEqual(
+            body["og:url"], "http://www.twitter.com/matrixdotorg/status/12345"
+        )
+        self.assertTrue(body["og:image"].startswith("mxc://"))
+        self.assertEqual(body["og:image:height"], 1)
+        self.assertEqual(body["og:image:width"], 1)
+        self.assertEqual(body["og:image:type"], "image/png")
+
     def _download_image(self):
         """Downloads an image into the URL cache.
 
         Returns:
             A (host, media_id) tuple representing the MXC URI of the image.
         """
@@ -13,7 +13,8 @@
 # limitations under the License.
 
 from synapse.rest.media.v1.preview_url_resource import (
-    decode_and_calc_og,
+    _calc_og,
+    decode_body,
     get_html_media_encoding,
     summarize_paragraphs,
 )
@@ -158,7 +159,8 @@ class CalcOgTestCase(unittest.TestCase):
         </html>
         """
 
-        og = decode_and_calc_og(html, "http://example.com/test.html")
+        tree = decode_body(html)
+        og = _calc_og(tree, "http://example.com/test.html")
 
         self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
 
@@ -173,7 +175,8 @@ class CalcOgTestCase(unittest.TestCase):
         </html>
         """
 
-        og = decode_and_calc_og(html, "http://example.com/test.html")
+        tree = decode_body(html)
+        og = _calc_og(tree, "http://example.com/test.html")
 
         self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
 
@@ -191,7 +194,8 @@ class CalcOgTestCase(unittest.TestCase):
         </html>
         """
 
-        og = decode_and_calc_og(html, "http://example.com/test.html")
+        tree = decode_body(html)
+        og = _calc_og(tree, "http://example.com/test.html")
 
         self.assertEqual(
             og,
@@ -212,7 +216,8 @@ class CalcOgTestCase(unittest.TestCase):
         </html>
         """
 
-        og = decode_and_calc_og(html, "http://example.com/test.html")
+        tree = decode_body(html)
+        og = _calc_og(tree, "http://example.com/test.html")
 
         self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
 
@@ -225,7 +230,8 @@ class CalcOgTestCase(unittest.TestCase):
         </html>
         """
 
-        og = decode_and_calc_og(html, "http://example.com/test.html")
+        tree = decode_body(html)
+        og = _calc_og(tree, "http://example.com/test.html")
 
         self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
 
@@ -239,7 +245,8 @@ class CalcOgTestCase(unittest.TestCase):
         </html>
         """
 
-        og = decode_and_calc_og(html, "http://example.com/test.html")
+        tree = decode_body(html)
+        og = _calc_og(tree, "http://example.com/test.html")
 
         self.assertEqual(og, {"og:title": "Title", "og:description": "Some text."})
 
@@ -253,21 +260,22 @@ class CalcOgTestCase(unittest.TestCase):
         </html>
         """
 
-        og = decode_and_calc_og(html, "http://example.com/test.html")
+        tree = decode_body(html)
+        og = _calc_og(tree, "http://example.com/test.html")
 
         self.assertEqual(og, {"og:title": None, "og:description": "Some text."})
 
     def test_empty(self):
         """Test a body with no data in it."""
         html = b""
-        og = decode_and_calc_og(html, "http://example.com/test.html")
-        self.assertEqual(og, {})
+        tree = decode_body(html)
+        self.assertIsNone(tree)
 
     def test_no_tree(self):
         """A valid body with no tree in it."""
         html = b"\x00"
-        og = decode_and_calc_og(html, "http://example.com/test.html")
-        self.assertEqual(og, {})
+        tree = decode_body(html)
+        self.assertIsNone(tree)
 
     def test_invalid_encoding(self):
         """An invalid character encoding should be ignored and treated as UTF-8, if possible."""
@@ -279,9 +287,8 @@ class CalcOgTestCase(unittest.TestCase):
         </body>
         </html>
         """
-        og = decode_and_calc_og(
-            html, "http://example.com/test.html", "invalid-encoding"
-        )
+        tree = decode_body(html, "invalid-encoding")
+        og = _calc_og(tree, "http://example.com/test.html")
         self.assertEqual(og, {"og:title": "Foo", "og:description": "Some text."})
 
     def test_invalid_encoding2(self):
@@ -295,7 +302,8 @@ class CalcOgTestCase(unittest.TestCase):
         </body>
         </html>
         """
-        og = decode_and_calc_og(html, "http://example.com/test.html")
+        tree = decode_body(html)
+        og = _calc_og(tree, "http://example.com/test.html")
         self.assertEqual(og, {"og:title": "ÿÿ Foo", "og:description": "Some text."})