From a8a5dd3b44a4526307502bd621ee0bd43c87c77f Mon Sep 17 00:00:00 2001 From: Matthew Hodgson Date: Thu, 31 Mar 2016 01:55:21 +0100 Subject: [PATCH] handle requests with missing content-length headers (e.g. YouTube) --- synapse/http/client.py | 33 +++++++++++++++---- synapse/rest/media/v1/preview_url_resource.py | 4 +-- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/synapse/http/client.py b/synapse/http/client.py index 71b2e3375..30f31a915 100644 --- a/synapse/http/client.py +++ b/synapse/http/client.py @@ -23,8 +23,9 @@ from canonicaljson import encode_canonical_json from twisted.internet import defer, reactor, ssl, protocol from twisted.web.client import ( - RedirectAgent, Agent, readBody, FileBodyProducer, PartialDownloadError, + BrowserLikeRedirectAgent, Agent, readBody, FileBodyProducer, PartialDownloadError, ) +from twisted.web.http import PotentialDataLoss from twisted.web.http_headers import Headers from twisted.web._newclient import ResponseDone @@ -59,11 +60,11 @@ class SimpleHttpClient(object): # The default context factory in Twisted 14.0.0 (which we require) is # BrowserLikePolicyForHTTPS which will do regular cert validation # 'like a browser' - self.agent = RedirectAgent(Agent( + self.agent = Agent( reactor, connectTimeout=15, contextFactory=hs.get_http_client_context_factory() - )) + ) self.user_agent = hs.version_string if hs.config.user_agent_suffix: self.user_agent = "%s %s" % (self.user_agent, hs.config.user_agent_suffix,) @@ -253,10 +254,6 @@ class SimpleHttpClient(object): headers. """ - def body_callback(method, url_bytes, headers_dict): - self.sign_request(destination, method, url_bytes, headers_dict) - return None - response = yield self.request( "GET", url.encode("ascii"), @@ -309,6 +306,10 @@ class _ReadBodyToFileProtocol(protocol.Protocol): def connectionLost(self, reason): if reason.check(ResponseDone): self.deferred.callback(self.length) + elif reason.check(PotentialDataLoss): + # stolen from https://github.com/twisted/treq/pull/49/files + # http://twistedmatrix.com/trac/ticket/4840 + self.deferred.callback(self.length) else: self.deferred.errback(reason) @@ -350,6 +351,24 @@ class CaptchaServerHttpClient(SimpleHttpClient): # twisted dislikes google's response, no content length. defer.returnValue(e.response) +class SpiderHttpClient(SimpleHttpClient): + """ + Separate HTTP client for spidering arbitrary URLs. + Special in that it follows retries and has a UA that looks + like a browser. + + used by the preview_url endpoint in the content repo. + """ + def __init__(self, hs): + SimpleHttpClient.__init__(self, hs) + # clobber the base class's agent and UA: + self.agent = BrowserLikeRedirectAgent(Agent( + reactor, + connectTimeout=15, + contextFactory=hs.get_http_client_context_factory() + )) + # Look like Chrome for now + #self.user_agent = ("Mozilla/5.0 (%s) (KHTML, like Gecko) Chrome Safari" % hs.version_string) def encode_urlencode_args(args): return {k: encode_urlencode_arg(v) for k, v in args.items()} diff --git a/synapse/rest/media/v1/preview_url_resource.py b/synapse/rest/media/v1/preview_url_resource.py index b999944e8..ca2529cc1 100644 --- a/synapse/rest/media/v1/preview_url_resource.py +++ b/synapse/rest/media/v1/preview_url_resource.py @@ -19,7 +19,7 @@ from twisted.web.server import NOT_DONE_YET from twisted.internet import defer from lxml import html from synapse.util.stringutils import random_string -from synapse.http.client import SimpleHttpClient +from synapse.http.client import SpiderHttpClient from synapse.http.server import request_handler, respond_with_json, respond_with_json_bytes import os @@ -33,7 +33,7 @@ class PreviewUrlResource(BaseMediaResource): def __init__(self, hs, filepaths): BaseMediaResource.__init__(self, hs, filepaths) - self.client = SimpleHttpClient(hs) + self.client = SpiderHttpClient(hs) def render_GET(self, request): self._async_render_GET(request)