Support gzip compression, and don't pass through error messages

This commit is contained in:
Matthew Hodgson 2016-04-02 03:06:39 +01:00
parent 9377157961
commit d1b154a10f
2 changed files with 11 additions and 5 deletions

View File

@ -23,7 +23,8 @@ from canonicaljson import encode_canonical_json
from twisted.internet import defer, reactor, ssl, protocol
from twisted.web.client import (
BrowserLikeRedirectAgent, Agent, readBody, FileBodyProducer, PartialDownloadError,
BrowserLikeRedirectAgent, ContentDecoderAgent, GzipDecoder, Agent,
readBody, FileBodyProducer, PartialDownloadError,
)
from twisted.web.http import PotentialDataLoss
from twisted.web.http_headers import Headers
@ -269,6 +270,10 @@ class SimpleHttpClient(object):
# XXX: do we want to explicitly drop the connection here somehow? if so, how?
raise # what should we be raising here?
if response.code > 299:
logger.warn("Got %d when downloading %s" % (response.code, url))
raise
# TODO: if our Content-Type is HTML or something, just read the first
# N bytes into RAM rather than saving it all to disk only to read it
# straight back in again
@ -366,11 +371,11 @@ class SpiderHttpClient(SimpleHttpClient):
def __init__(self, hs):
SimpleHttpClient.__init__(self, hs)
# clobber the base class's agent and UA:
self.agent = BrowserLikeRedirectAgent(Agent(
self.agent = ContentDecoderAgent(BrowserLikeRedirectAgent(Agent(
reactor,
connectTimeout=15,
contextFactory=hs.get_http_client_context_factory()
))
)), [('gzip', GzipDecoder)])
# Look like Chrome for now
#self.user_agent = ("Mozilla/5.0 (%s) (KHTML, like Gecko) Chrome Safari" % hs.version_string)

View File

@ -259,7 +259,8 @@ class PreviewUrlResource(BaseMediaResource):
length, headers, uri = yield self.client.get_file(
url, output_stream=f, max_size=self.max_spider_size,
)
# FIXME: handle 404s sanely - don't spider an error page
# FIXME: pass through 404s and other error messages nicely
media_type = headers["Content-Type"][0]
time_now_ms = self.clock.time_msec()