handle requests with missing content-length headers (e.g. YouTube)

This commit is contained in:
Matthew Hodgson 2016-03-31 01:55:21 +01:00
parent 7178ab7da0
commit a8a5dd3b44
2 changed files with 28 additions and 9 deletions

View File

@ -23,8 +23,9 @@ from canonicaljson import encode_canonical_json
from twisted.internet import defer, reactor, ssl, protocol from twisted.internet import defer, reactor, ssl, protocol
from twisted.web.client import ( from twisted.web.client import (
RedirectAgent, Agent, readBody, FileBodyProducer, PartialDownloadError, BrowserLikeRedirectAgent, Agent, readBody, FileBodyProducer, PartialDownloadError,
) )
from twisted.web.http import PotentialDataLoss
from twisted.web.http_headers import Headers from twisted.web.http_headers import Headers
from twisted.web._newclient import ResponseDone from twisted.web._newclient import ResponseDone
@ -59,11 +60,11 @@ class SimpleHttpClient(object):
# The default context factory in Twisted 14.0.0 (which we require) is # The default context factory in Twisted 14.0.0 (which we require) is
# BrowserLikePolicyForHTTPS which will do regular cert validation # BrowserLikePolicyForHTTPS which will do regular cert validation
# 'like a browser' # 'like a browser'
self.agent = RedirectAgent(Agent( self.agent = Agent(
reactor, reactor,
connectTimeout=15, connectTimeout=15,
contextFactory=hs.get_http_client_context_factory() contextFactory=hs.get_http_client_context_factory()
)) )
self.user_agent = hs.version_string self.user_agent = hs.version_string
if hs.config.user_agent_suffix: if hs.config.user_agent_suffix:
self.user_agent = "%s %s" % (self.user_agent, hs.config.user_agent_suffix,) self.user_agent = "%s %s" % (self.user_agent, hs.config.user_agent_suffix,)
@ -253,10 +254,6 @@ class SimpleHttpClient(object):
headers. headers.
""" """
def body_callback(method, url_bytes, headers_dict):
self.sign_request(destination, method, url_bytes, headers_dict)
return None
response = yield self.request( response = yield self.request(
"GET", "GET",
url.encode("ascii"), url.encode("ascii"),
@ -309,6 +306,10 @@ class _ReadBodyToFileProtocol(protocol.Protocol):
def connectionLost(self, reason):
    """
    Settle self.deferred once the response body stream has ended.

    If `reason` matches ResponseDone or PotentialDataLoss, the deferred
    is fired with self.length; any other reason is handed to the
    deferred's errback unchanged.
    """
    # PotentialDataLoss is deliberately treated like a clean ResponseDone:
    # stolen from https://github.com/twisted/treq/pull/49/files
    # http://twistedmatrix.com/trac/ticket/4840
    finished = reason.check(ResponseDone, PotentialDataLoss)
    if not finished:
        self.deferred.errback(reason)
        return
    self.deferred.callback(self.length)
@ -350,6 +351,24 @@ class CaptchaServerHttpClient(SimpleHttpClient):
# twisted dislikes google's response, no content length. # twisted dislikes google's response, no content length.
defer.returnValue(e.response) defer.returnValue(e.response)
class SpiderHttpClient(SimpleHttpClient):
    """
    HTTP client dedicated to spidering arbitrary URLs.

    Differs from SimpleHttpClient in that its agent is wrapped in a
    BrowserLikeRedirectAgent, so that redirects are followed. A
    browser-like user agent string is intended as well, but that override
    is currently commented out, so the base class's user agent stays in
    effect.

    Used by the preview_url endpoint in the content repo.
    """
    def __init__(self, hs):
        SimpleHttpClient.__init__(self, hs)

        # Replace the agent installed by the base class with one that is
        # wrapped in a redirect-following agent.
        plain_agent = Agent(
            reactor,
            connectTimeout=15,
            contextFactory=hs.get_http_client_context_factory()
        )
        self.agent = BrowserLikeRedirectAgent(plain_agent)

        # Look like Chrome for now
        #self.user_agent = ("Mozilla/5.0 (%s) (KHTML, like Gecko) Chrome Safari" % hs.version_string)
def encode_urlencode_args(args):
    """
    Return a new dict with the same keys as `args`, where each value has
    been passed through encode_urlencode_arg.
    """
    encoded = {}
    for key, value in args.items():
        encoded[key] = encode_urlencode_arg(value)
    return encoded

View File

@ -19,7 +19,7 @@ from twisted.web.server import NOT_DONE_YET
from twisted.internet import defer from twisted.internet import defer
from lxml import html from lxml import html
from synapse.util.stringutils import random_string from synapse.util.stringutils import random_string
from synapse.http.client import SimpleHttpClient from synapse.http.client import SpiderHttpClient
from synapse.http.server import request_handler, respond_with_json, respond_with_json_bytes from synapse.http.server import request_handler, respond_with_json, respond_with_json_bytes
import os import os
@ -33,7 +33,7 @@ class PreviewUrlResource(BaseMediaResource):
def __init__(self, hs, filepaths): def __init__(self, hs, filepaths):
BaseMediaResource.__init__(self, hs, filepaths) BaseMediaResource.__init__(self, hs, filepaths)
self.client = SimpleHttpClient(hs) self.client = SpiderHttpClient(hs)
def render_GET(self, request): def render_GET(self, request):
self._async_render_GET(request) self._async_render_GET(request)