Cache failures in url_preview handler

Reshuffle the caching logic in the url_preview handler so that failures are
cached (and to generally simplify things and fix the logcontext leaks).
This commit is contained in:
Richard van der Hoff 2017-11-10 16:34:33 +00:00
parent 4d0414c714
commit 46790f50cf

View File

@ -20,6 +20,7 @@ from twisted.web.resource import Resource
from synapse.api.errors import ( from synapse.api.errors import (
SynapseError, Codes, SynapseError, Codes,
) )
from synapse.util.logcontext import preserve_fn, make_deferred_yieldable
from synapse.util.stringutils import random_string from synapse.util.stringutils import random_string
from synapse.util.caches.expiringcache import ExpiringCache from synapse.util.caches.expiringcache import ExpiringCache
from synapse.http.client import SpiderHttpClient from synapse.http.client import SpiderHttpClient
@ -63,16 +64,15 @@ class PreviewUrlResource(Resource):
self.url_preview_url_blacklist = hs.config.url_preview_url_blacklist self.url_preview_url_blacklist = hs.config.url_preview_url_blacklist
# simple memory cache mapping urls to OG metadata # memory cache mapping urls to an ObservableDeferred returning
self.cache = ExpiringCache( # JSON-encoded OG metadata
self._cache = ExpiringCache(
cache_name="url_previews", cache_name="url_previews",
clock=self.clock, clock=self.clock,
# don't spider URLs more often than once an hour # don't spider URLs more often than once an hour
expiry_ms=60 * 60 * 1000, expiry_ms=60 * 60 * 1000,
) )
self.cache.start() self._cache.start()
self.downloads = {}
self._cleaner_loop = self.clock.looping_call( self._cleaner_loop = self.clock.looping_call(
self._expire_url_cache_data, 10 * 1000 self._expire_url_cache_data, 10 * 1000
@ -94,6 +94,7 @@ class PreviewUrlResource(Resource):
else: else:
ts = self.clock.time_msec() ts = self.clock.time_msec()
# XXX: we could move this into _do_preview if we wanted.
url_tuple = urlparse.urlsplit(url) url_tuple = urlparse.urlsplit(url)
for entry in self.url_preview_url_blacklist: for entry in self.url_preview_url_blacklist:
match = True match = True
@ -126,14 +127,40 @@ class PreviewUrlResource(Resource):
Codes.UNKNOWN Codes.UNKNOWN
) )
# first check the memory cache - good to handle all the clients on this # the in-memory cache:
# HS thundering away to preview the same URL at the same time. # * ensures that only one request is active at a time
og = self.cache.get(url) # * takes load off the DB for the thundering herds
if og: # * also caches any failures (unlike the DB) so we don't keep
respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True) # requesting the same endpoint
return
# then check the URL cache in the DB (which will also provide us with observable = self._cache.get(url)
if not observable:
download = preserve_fn(self._do_preview)(
url, requester.user, ts,
)
observable = ObservableDeferred(
download,
consumeErrors=True
)
self._cache[url] = observable
og = yield make_deferred_yieldable(observable.observe())
respond_with_json_bytes(request, 200, og, send_cors=True)
@defer.inlineCallbacks
def _do_preview(self, url, user, ts):
"""Check the db, and download the URL and build a preview
Args:
url (str):
user (str):
ts (int):
Returns:
Deferred[str]: json-encoded og data
"""
# check the URL cache in the DB (which will also provide us with
# historical previews, if we have any) # historical previews, if we have any)
cache_result = yield self.store.get_url_cache(url, ts) cache_result = yield self.store.get_url_cache(url, ts)
if ( if (
@ -141,32 +168,10 @@ class PreviewUrlResource(Resource):
cache_result["expires_ts"] > ts and cache_result["expires_ts"] > ts and
cache_result["response_code"] / 100 == 2 cache_result["response_code"] / 100 == 2
): ):
respond_with_json_bytes( defer.returnValue(cache_result["og"])
request, 200, cache_result["og"].encode('utf-8'),
send_cors=True
)
return return
# Ensure only one download for a given URL is active at a time media_info = yield self._download_url(url, user)
download = self.downloads.get(url)
if download is None:
download = self._download_url(url, requester.user)
download = ObservableDeferred(
download,
consumeErrors=True
)
self.downloads[url] = download
@download.addBoth
def callback(media_info):
del self.downloads[url]
return media_info
media_info = yield download.observe()
# FIXME: we should probably update our cache now anyway, so that
# even if the OG calculation raises, we don't keep hammering on the
# remote server. For now, leave it uncached to aid debugging OG
# calculation problems
logger.debug("got media_info of '%s'" % media_info) logger.debug("got media_info of '%s'" % media_info)
@ -212,7 +217,7 @@ class PreviewUrlResource(Resource):
# just rely on the caching on the master request to speed things up. # just rely on the caching on the master request to speed things up.
if 'og:image' in og and og['og:image']: if 'og:image' in og and og['og:image']:
image_info = yield self._download_url( image_info = yield self._download_url(
_rebase_url(og['og:image'], media_info['uri']), requester.user _rebase_url(og['og:image'], media_info['uri']), user
) )
if _is_media(image_info['media_type']): if _is_media(image_info['media_type']):
@ -239,8 +244,7 @@ class PreviewUrlResource(Resource):
logger.debug("Calculated OG for %s as %s" % (url, og)) logger.debug("Calculated OG for %s as %s" % (url, og))
# store OG in ephemeral in-memory cache jsonog = json.dumps(og)
self.cache[url] = og
# store OG in history-aware DB cache # store OG in history-aware DB cache
yield self.store.store_url_cache( yield self.store.store_url_cache(
@ -248,12 +252,12 @@ class PreviewUrlResource(Resource):
media_info["response_code"], media_info["response_code"],
media_info["etag"], media_info["etag"],
media_info["expires"] + media_info["created_ts"], media_info["expires"] + media_info["created_ts"],
json.dumps(og), jsonog,
media_info["filesystem_id"], media_info["filesystem_id"],
media_info["created_ts"], media_info["created_ts"],
) )
respond_with_json_bytes(request, 200, json.dumps(og), send_cors=True) defer.returnValue(jsonog)
@defer.inlineCallbacks @defer.inlineCallbacks
def _download_url(self, url, user): def _download_url(self, url, user):