Reduce the memory usage of previewing media files. (#9421)

This reduces the memory usage of previewing media files which
end up larger than the `max_spider_size` by avoiding buffering
content internally in treq.

It also checks the `Content-Length` header in additional places
instead of streaming the content to check the body length.
This commit is contained in:
Patrick Cloke 2021-02-18 09:01:29 -05:00 committed by GitHub
parent bb2577f6b7
commit 8ec2217103
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 18 additions and 18 deletions

1
changelog.d/9421.bugfix Normal file
View File

@ -0,0 +1 @@
Reduce the amount of memory used when generating the URL preview of a file that is larger than the `max_spider_size`.

View File

@ -56,7 +56,7 @@ from twisted.web.client import (
) )
from twisted.web.http import PotentialDataLoss from twisted.web.http import PotentialDataLoss
from twisted.web.http_headers import Headers from twisted.web.http_headers import Headers
from twisted.web.iweb import IAgent, IBodyProducer, IResponse from twisted.web.iweb import UNKNOWN_LENGTH, IAgent, IBodyProducer, IResponse
from synapse.api.errors import Codes, HttpResponseException, SynapseError from synapse.api.errors import Codes, HttpResponseException, SynapseError
from synapse.http import QuieterFileBodyProducer, RequestTimedOutError, redact_uri from synapse.http import QuieterFileBodyProducer, RequestTimedOutError, redact_uri
@ -408,6 +408,9 @@ class SimpleHttpClient:
agent=self.agent, agent=self.agent,
data=body_producer, data=body_producer,
headers=headers, headers=headers,
# Avoid buffering the body in treq since we do not reuse
# response bodies.
unbuffered=True,
**self._extra_treq_args, **self._extra_treq_args,
) # type: defer.Deferred ) # type: defer.Deferred
@ -702,18 +705,6 @@ class SimpleHttpClient:
resp_headers = dict(response.headers.getAllRawHeaders()) resp_headers = dict(response.headers.getAllRawHeaders())
if (
b"Content-Length" in resp_headers
and max_size
and int(resp_headers[b"Content-Length"][0]) > max_size
):
logger.warning("Requested URL is too large > %r bytes" % (max_size,))
raise SynapseError(
502,
"Requested file is too large > %r bytes" % (max_size,),
Codes.TOO_LARGE,
)
if response.code > 299: if response.code > 299:
logger.warning("Got %d when downloading %s" % (response.code, url)) logger.warning("Got %d when downloading %s" % (response.code, url))
raise SynapseError(502, "Got error %d" % (response.code,), Codes.UNKNOWN) raise SynapseError(502, "Got error %d" % (response.code,), Codes.UNKNOWN)
@ -780,7 +771,9 @@ class _ReadBodyWithMaxSizeProtocol(protocol.Protocol):
# in the meantime. # in the meantime.
if self.max_size is not None and self.length >= self.max_size: if self.max_size is not None and self.length >= self.max_size:
self.deferred.errback(BodyExceededMaxSize()) self.deferred.errback(BodyExceededMaxSize())
self.transport.loseConnection() # Close the connection (forcefully) since all the data will get
# discarded anyway.
self.transport.abortConnection()
def connectionLost(self, reason: Failure) -> None: def connectionLost(self, reason: Failure) -> None:
# If the maximum size was already exceeded, there's nothing to do. # If the maximum size was already exceeded, there's nothing to do.
@ -814,6 +807,11 @@ def read_body_with_max_size(
Returns: Returns:
A Deferred which resolves to the length of the read body. A Deferred which resolves to the length of the read body.
""" """
# If the Content-Length header gives a size larger than the maximum allowed
# size, do not bother downloading the body.
if max_size is not None and response.length != UNKNOWN_LENGTH:
if response.length > max_size:
return defer.fail(BodyExceededMaxSize())
d = defer.Deferred() d = defer.Deferred()
response.deliverBody(_ReadBodyWithMaxSizeProtocol(stream, d, max_size)) response.deliverBody(_ReadBodyWithMaxSizeProtocol(stream, d, max_size))

View File

@ -18,6 +18,7 @@ from mock import Mock
from twisted.python.failure import Failure from twisted.python.failure import Failure
from twisted.web.client import ResponseDone from twisted.web.client import ResponseDone
from twisted.web.iweb import UNKNOWN_LENGTH
from synapse.http.client import BodyExceededMaxSize, read_body_with_max_size from synapse.http.client import BodyExceededMaxSize, read_body_with_max_size
@ -27,12 +28,12 @@ from tests.unittest import TestCase
class ReadBodyWithMaxSizeTests(TestCase): class ReadBodyWithMaxSizeTests(TestCase):
def setUp(self): def setUp(self):
"""Start reading the body, returns the response, result and proto""" """Start reading the body, returns the response, result and proto"""
self.response = Mock() response = Mock(length=UNKNOWN_LENGTH)
self.result = BytesIO() self.result = BytesIO()
self.deferred = read_body_with_max_size(self.response, self.result, 6) self.deferred = read_body_with_max_size(response, self.result, 6)
# Fish the protocol out of the response. # Fish the protocol out of the response.
self.protocol = self.response.deliverBody.call_args[0][0] self.protocol = response.deliverBody.call_args[0][0]
self.protocol.transport = Mock() self.protocol.transport = Mock()
def _cleanup_error(self): def _cleanup_error(self):
@ -88,7 +89,7 @@ class ReadBodyWithMaxSizeTests(TestCase):
self.protocol.dataReceived(b"1234567890") self.protocol.dataReceived(b"1234567890")
self.assertIsInstance(self.deferred.result, Failure) self.assertIsInstance(self.deferred.result, Failure)
self.assertIsInstance(self.deferred.result.value, BodyExceededMaxSize) self.assertIsInstance(self.deferred.result.value, BodyExceededMaxSize)
self.protocol.transport.loseConnection.assert_called_once() self.protocol.transport.abortConnection.assert_called_once()
# More data might have come in. # More data might have come in.
self.protocol.dataReceived(b"1234567890") self.protocol.dataReceived(b"1234567890")