mirror of
https://git.anonymousland.org/anonymousland/synapse.git
synced 2025-08-03 03:26:21 -04:00
Merge branch 'erikj/media_spam_checker' into develop
This commit is contained in:
commit
adc96d4236
6 changed files with 210 additions and 6 deletions
|
@ -17,6 +17,8 @@
|
|||
import inspect
|
||||
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
|
||||
|
||||
from synapse.rest.media.v1._base import FileInfo
|
||||
from synapse.rest.media.v1.media_storage import ReadableFileWrapper
|
||||
from synapse.spam_checker_api import RegistrationBehaviour
|
||||
from synapse.types import Collection
|
||||
from synapse.util.async_helpers import maybe_awaitable
|
||||
|
@ -214,3 +216,48 @@ class SpamChecker:
|
|||
return behaviour
|
||||
|
||||
return RegistrationBehaviour.ALLOW
|
||||
|
||||
async def check_media_file_for_spam(
|
||||
self, file_wrapper: ReadableFileWrapper, file_info: FileInfo
|
||||
) -> bool:
|
||||
"""Checks if a piece of newly uploaded media should be blocked.
|
||||
|
||||
This will be called for local uploads, downloads of remote media, each
|
||||
thumbnail generated for those, and web pages/images used for URL
|
||||
previews.
|
||||
|
||||
Note that care should be taken to not do blocking IO operations in the
|
||||
main thread. For example, to get the contents of a file a module
|
||||
should do::
|
||||
|
||||
async def check_media_file_for_spam(
|
||||
self, file: ReadableFileWrapper, file_info: FileInfo
|
||||
) -> bool:
|
||||
buffer = BytesIO()
|
||||
await file.write_chunks_to(buffer.write)
|
||||
|
||||
if buffer.getvalue() == b"Hello World":
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
||||
Args:
|
||||
file: An object that allows reading the contents of the media.
|
||||
file_info: Metadata about the file.
|
||||
|
||||
Returns:
|
||||
True if the media should be blocked or False if it should be
|
||||
allowed.
|
||||
"""
|
||||
|
||||
for spam_checker in self.spam_checkers:
|
||||
# For backwards compatibility, only run if the method exists on the
|
||||
# spam checker
|
||||
checker = getattr(spam_checker, "check_media_file_for_spam", None)
|
||||
if checker:
|
||||
spam = await maybe_awaitable(checker(file_wrapper, file_info))
|
||||
if spam:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
|
|
@ -16,13 +16,17 @@ import contextlib
|
|||
import logging
|
||||
import os
|
||||
import shutil
|
||||
from typing import IO, TYPE_CHECKING, Any, Optional, Sequence
|
||||
from typing import IO, TYPE_CHECKING, Any, Callable, Optional, Sequence
|
||||
|
||||
import attr
|
||||
|
||||
from twisted.internet.defer import Deferred
|
||||
from twisted.internet.interfaces import IConsumer
|
||||
from twisted.protocols.basic import FileSender
|
||||
|
||||
from synapse.api.errors import NotFoundError
|
||||
from synapse.logging.context import defer_to_thread, make_deferred_yieldable
|
||||
from synapse.util import Clock
|
||||
from synapse.util.file_consumer import BackgroundFileConsumer
|
||||
|
||||
from ._base import FileInfo, Responder
|
||||
|
@ -58,6 +62,8 @@ class MediaStorage:
|
|||
self.local_media_directory = local_media_directory
|
||||
self.filepaths = filepaths
|
||||
self.storage_providers = storage_providers
|
||||
self.spam_checker = hs.get_spam_checker()
|
||||
self.clock = hs.get_clock()
|
||||
|
||||
async def store_file(self, source: IO, file_info: FileInfo) -> str:
|
||||
"""Write `source` to the on disk media store, and also any other
|
||||
|
@ -127,18 +133,29 @@ class MediaStorage:
|
|||
f.flush()
|
||||
f.close()
|
||||
|
||||
spam = await self.spam_checker.check_media_file_for_spam(
|
||||
ReadableFileWrapper(self.clock, fname), file_info
|
||||
)
|
||||
if spam:
|
||||
logger.info("Blocking media due to spam checker")
|
||||
# Note that we'll delete the stored media, due to the
|
||||
# try/except below. The media also won't be stored in
|
||||
# the DB.
|
||||
raise SpamMediaException()
|
||||
|
||||
for provider in self.storage_providers:
|
||||
await provider.store_file(path, file_info)
|
||||
|
||||
finished_called[0] = True
|
||||
|
||||
yield f, fname, finish
|
||||
except Exception:
|
||||
except Exception as e:
|
||||
try:
|
||||
os.remove(fname)
|
||||
except Exception:
|
||||
pass
|
||||
raise
|
||||
|
||||
raise e from None
|
||||
|
||||
if not finished_called:
|
||||
raise Exception("Finished callback not called")
|
||||
|
@ -302,3 +319,39 @@ class FileResponder(Responder):
|
|||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||
self.open_file.close()
|
||||
|
||||
|
||||
class SpamMediaException(NotFoundError):
|
||||
"""The media was blocked by a spam checker, so we simply 404 the request (in
|
||||
the same way as if it was quarantined).
|
||||
"""
|
||||
|
||||
|
||||
@attr.s(slots=True)
|
||||
class ReadableFileWrapper:
|
||||
"""Wrapper that allows reading a file in chunks, yielding to the reactor,
|
||||
and writing to a callback.
|
||||
|
||||
This is simplified `FileSender` that takes an IO object rather than an
|
||||
`IConsumer`.
|
||||
"""
|
||||
|
||||
CHUNK_SIZE = 2 ** 14
|
||||
|
||||
clock = attr.ib(type=Clock)
|
||||
path = attr.ib(type=str)
|
||||
|
||||
async def write_chunks_to(self, callback: Callable[[bytes], None]):
|
||||
"""Reads the file in chunks and calls the callback with each chunk.
|
||||
"""
|
||||
|
||||
with open(self.path, "rb") as file:
|
||||
while True:
|
||||
chunk = file.read(self.CHUNK_SIZE)
|
||||
if not chunk:
|
||||
break
|
||||
|
||||
callback(chunk)
|
||||
|
||||
# We yield to the reactor by sleeping for 0 seconds.
|
||||
await self.clock.sleep(0)
|
||||
|
|
|
@ -22,6 +22,7 @@ from twisted.web.http import Request
|
|||
from synapse.api.errors import Codes, SynapseError
|
||||
from synapse.http.server import DirectServeJsonResource, respond_with_json
|
||||
from synapse.http.servlet import parse_string
|
||||
from synapse.rest.media.v1.media_storage import SpamMediaException
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from synapse.app.homeserver import HomeServer
|
||||
|
@ -86,9 +87,14 @@ class UploadResource(DirectServeJsonResource):
|
|||
# disposition = headers.getRawHeaders(b"Content-Disposition")[0]
|
||||
# TODO(markjh): parse content-dispostion
|
||||
|
||||
content_uri = await self.media_repo.create_content(
|
||||
media_type, upload_name, request.content, content_length, requester.user
|
||||
)
|
||||
try:
|
||||
content_uri = await self.media_repo.create_content(
|
||||
media_type, upload_name, request.content, content_length, requester.user
|
||||
)
|
||||
except SpamMediaException:
|
||||
# For uploading of media we want to respond with a 400, instead of
|
||||
# the default 404, as that would just be confusing.
|
||||
raise SynapseError(400, "Bad content")
|
||||
|
||||
logger.info("Uploaded content with URI %r", content_uri)
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue