file removal background job

This commit is contained in:
jfriedli 2020-03-27 10:59:16 -07:00 committed by jvoisin
parent d50f68ae44
commit 7104b2107d
9 changed files with 196 additions and 19 deletions

View File

@ -52,7 +52,11 @@ Note that you can add multiple hosts from which you want to accept API requests.
a space.
**IMPORTANT:** The default value if the variable is not set is: `Access-Control-Allow-Origin: *`
Configure another environment variable: `MAT2_MAX_FILES_BULK_DOWNLOAD=10`
Configure the following environment variables:
- `MAT2_MAX_FILES_BULK_DOWNLOAD=10` Max number of files that can be grouped for a bulk download.
- `MAT2_MAX_FILE_AGE_FOR_REMOVAL=900` Seconds a file in the upload folder is kept.
After that it will be deleted. Default `15 * 60`
This specifies the max number of files that can be bulk downloaded using the api.
Note: Each file has a max file size of 16mb
@ -66,10 +70,6 @@ systemctl restart nginx/apache/…
It should now be working.
You should add `find /var/www/mat2-web/uploads/ -type f -mtime +1 -exec rm {} \;`
in a crontab to remove files that people might have uploaded but never
downloaded.
# Deploy via Ansible
If you happen to be using [Ansible](https://www.ansible.com/), there's an
@ -92,10 +92,6 @@ https://0xacab.org/jvoisin/mat2-web/container_registry
Example:
`docker run -p 80:80 -d -e MAT2_ALLOW_ORIGIN_WHITELIST='https://myhost1.org' registry.0xacab.org/jvoisin/mat2-web:latest`
Make sure to add
`find /var/www/mat2-web/uploads/ -type f -mtime +1 -exec rm {} \;` as cron job
run inside the container.
# Development
Install docker and docker-compose and then run `docker-compose up` to setup
the docker dev environment. Mat2-web is now accessible on your host machine at `localhost:5000`.

View File

@ -9,6 +9,7 @@ services:
- FLASK_ENV=development
- MAT2_ALLOW_ORIGIN_WHITELIST=*
- MAT2_MAX_FILES_BULK_DOWNLOAD=10
- MAT2_MAX_FILE_AGE_FOR_REMOVAL=60
ports:
- "5000:5000"
volumes:

26
file_removal_scheduler.py Normal file
View File

@ -0,0 +1,26 @@
import glob
import time
import sys
import os
import random
def run_file_removal_job(upload_folder_path):
if random.randint(0, 10) == 0:
for file in glob.glob(upload_folder_path + '/*'):
delete_file_when_too_old(file)
def delete_file_when_too_old(filepath):
file_mod_time = os.stat(filepath).st_mtime
# time in second since last modification of file
last_time = time.time() - file_mod_time
# if file is older than our configured max timeframe, delete it
if last_time > int(os.environ.get('MAT2_MAX_FILE_AGE_FOR_REMOVAL', 15 * 60)):
try:
os.remove(filepath)
except OSError:
print('Automatic File Removal failed on file: ' + str(filepath))
sys.exit(1)

20
main.py
View File

@ -10,6 +10,7 @@ import zipfile
from cerberus import Validator
import utils
import file_removal_scheduler
from libmat2 import parser_factory
from flask import Flask, flash, request, redirect, url_for, render_template, send_from_directory, after_this_request
from flask_restful import Resource, Api, reqparse, abort
@ -25,31 +26,36 @@ def create_app(test_config=None):
app.config['UPLOAD_FOLDER'] = './uploads/'
app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024 # 16MB
app.config['CUSTOM_TEMPLATES_DIR'] = 'custom_templates'
app.config.from_object('config') # optionally load settings from config.py
# optionally load settings from config.py
app.config.from_object('config')
if test_config is not None:
app.config.update(test_config)
app.jinja_loader = jinja2.ChoiceLoader([ # type: ignore
jinja2.FileSystemLoader(app.config['CUSTOM_TEMPLATES_DIR']),
app.jinja_loader,
])
])
api = Api(app)
CORS(app, resources={r"/api/*": {"origins": utils.get_allow_origin_header_value()}})
@app.route('/download/<string:key>/<string:filename>')
def download_file(key: str, filename:str):
def download_file(key: str, filename: str):
if filename != secure_filename(filename):
return redirect(url_for('upload_file'))
complete_path, filepath = get_file_paths(filename)
file_removal_scheduler.run_file_removal_job(app.config['UPLOAD_FOLDER'])
if not os.path.exists(complete_path):
return redirect(url_for('upload_file'))
if hmac.compare_digest(utils.hash_file(complete_path), key) is False:
return redirect(url_for('upload_file'))
@after_this_request
def remove_file(response):
os.remove(complete_path)
if os.path.exists(complete_path):
os.remove(complete_path)
return response
return send_from_directory(app.config['UPLOAD_FOLDER'], filepath, as_attachment=True)
@ -176,9 +182,11 @@ def create_app(test_config=None):
complete_path, filepath = is_valid_api_download_file(filename, key)
# Make sure the file is NOT deleted on HEAD requests
if request.method == 'GET':
file_removal_scheduler.run_file_removal_job(app.config['UPLOAD_FOLDER'])
@after_this_request
def remove_file(response):
os.remove(complete_path)
if os.path.exists(complete_path):
os.remove(complete_path)
return response
return send_from_directory(app.config['UPLOAD_FOLDER'], filepath, as_attachment=True)

View File

@ -5,4 +5,4 @@ mat2==0.9.0
flask==1.0.3
Flask-RESTful==0.3.7
Flask-Cors==3.0.8
Cerberus==1.3.1
Cerberus==1.3.1

View File

@ -14,7 +14,7 @@ mat2 <b>could not</b> remove all the metadata from <pre>{{ filename }}</pre>, th
</ul>
{%endif %}
</p>
<a class="button button-primary" download href='{{ url_for('download_file', key=key, filename=filename) }}'>⇩ Download cleaned file</a>
<a class="button button-primary" href='{{ url_for('download_file', key=key, filename=filename) }}'>⇩ Download cleaned file</a>
<hr/>

View File

@ -1,9 +1,12 @@
import base64
import unittest
import tempfile
import shutil
import io
import os
from unittest.mock import patch
import main
@ -62,6 +65,33 @@ class Mat2WebTestCase(unittest.TestCase):
rv.data)
self.assertEqual(rv.status_code, 200)
def test_get_upload_no_selected_file(self):
rv = self.app.post('/',
data=dict(
file=(io.BytesIO(b""), ''),
), follow_redirects=True)
self.assertIn(b'No selected file',
rv.data)
self.assertEqual(rv.status_code, 200)
def test_failed_cleaning(self):
zip_file_bytes = base64.b64decode(
'UEsDBBQACAAIAPicPE8AAAAAAAAAAAAAAAAXACAAZmFpbGluZy5ub3Qtd29ya2luZy1le'
'HRVVA0AB+Saj13kmo9d5JqPXXV4CwABBOkDAAAE6QMAAAMAUEsHCAAAAAACAAAAAAAAAFBL'
'AwQUAAgACAD6nDxPAAAAAAAAAAAAAAAACQAgAHRlc3QuanNvblVUDQAH6JqPXeiaj13omo9d'
'dXgLAAEE6QMAAATpAwAAAwBQSwcIAAAAAAIAAAAAAAAAUEsBAhQDFAAIAAgA+Jw8TwAAAAACA'
'AAAAAAAABcAIAAAAAAAAAAAAKSBAAAAAGZhaWxpbmcubm90LXdvcmtpbmctZXh0VVQNAAfkmo9'
'd5JqPXeSaj111eAsAAQTpAwAABOkDAABQSwECFAMUAAgACAD6nDxPAAAAAAIAAAAAAAAACQAgA'
'AAAAAAAAAAApIFnAAAAdGVzdC5qc29uVVQNAAfomo9d6JqPXeiaj111eAsAAQTpAwAABOkDAAB'
'QSwUGAAAAAAIAAgC8AAAAwAAAAAAA'
)
rv = self.app.post('/',
data=dict(
file=(io.BytesIO(zip_file_bytes), 'test.zip'),
), follow_redirects=True)
self.assertIn(b'Unable to clean',rv.data)
self.assertEqual(rv.status_code, 200)
def test_get_upload_no_file_name(self):
rv = self.app.post('/',
data=dict(
@ -97,6 +127,29 @@ class Mat2WebTestCase(unittest.TestCase):
rv = self.app.get('/download/70623619c449a040968cdbea85945bf384fa30ed2d5d24fa3/test.cleaned.txt')
self.assertEqual(rv.status_code, 302)
@patch('file_removal_scheduler.random.randint')
def test_upload_leftover(self, randint_mock):
randint_mock.return_value = 0
os.environ['MAT2_MAX_FILE_AGE_FOR_REMOVAL'] = '0'
app = main.create_app()
self.upload_folder = tempfile.mkdtemp()
app.config.update(
TESTING=True,
UPLOAD_FOLDER=self.upload_folder
)
app = app.test_client()
request = self.app.post('/',
data=dict(
file=(io.BytesIO(b"Some text"), 'test.txt'),
), follow_redirects=True)
self.assertEqual(request.status_code, 200)
request = app.get(
b'/download/4c2e9e6da31a64c70623619c449a040968cdbea85945bf384fa30ed2d5d24fa3/test.cleaned.txt'
)
self.assertEqual(302, request.status_code)
os.environ['MAT2_MAX_FILE_AGE_FOR_REMOVAL'] = '9999'
if __name__ == '__main__':
unittest.main()

View File

@ -4,9 +4,10 @@ import json
import os
import shutil
import zipfile
from six import BytesIO
from unittest.mock import patch
import main
@ -122,6 +123,23 @@ class Mat2APITestCase(unittest.TestCase):
rv = self.app.get('/api/extension', headers={'Origin': 'origin1.gnu'})
self.assertEqual(rv.headers['Access-Control-Allow-Origin'], 'origin1.gnu')
def test_api_cleaning_failed(self):
request = self.app.post('/api/upload',
data='{"file_name": "test_name.zip", '
'"file": "UEsDBBQACAAIAPicPE8AAAAAAAAAAAAAAAAXACAAZmFpbGluZy5ub3Qt'
'd29ya2luZy1leHRVVA0AB+Saj13kmo9d5JqPXXV4CwABBOkDAAAE6QMAAAMAUEsHCAAA'
'AAACAAAAAAAAAFBLAwQUAAgACAD6nDxPAAAAAAAAAAAAAAAACQAgAHRlc3QuanNvblVUD'
'QAH6JqPXeiaj13omo9ddXgLAAEE6QMAAATpAwAAAwBQSwcIAAAAAAIAAAAAAAAAUEsBAhQD'
'FAAIAAgA+Jw8TwAAAAACAAAAAAAAABcAIAAAAAAAAAAAAKSBAAAAAGZhaWxpbmcubm90LXd'
'vcmtpbmctZXh0VVQNAAfkmo9d5JqPXeSaj111eAsAAQTpAwAABOkDAABQSwECFAMUAAgACAD6'
'nDxPAAAAAAIAAAAAAAAACQAgAAAAAAAAAAAApIFnAAAAdGVzdC5qc29uVVQNAAfomo9d6JqPXe'
'iaj111eAsAAQTpAwAABOkDAABQSwUGAAAAAAIAAgC8AAAAwAAAAAAA"}',
headers={'content-type': 'application/json'}
)
error = json.loads(request.data.decode('utf-8'))['message']
self.assertEqual(error, 'Unable to clean application/zip')
def test_api_download(self):
request = self.app.post('/api/upload',
data='{"file_name": "test_name.jpg", '
@ -263,7 +281,6 @@ class Mat2APITestCase(unittest.TestCase):
)
response = json.loads(request.data.decode('utf-8'))
print(response)
self.assertEqual(response['message']['download_list'][0]['0'][0]['file_name'][0], 'required field')
self.assertEqual(response['message']['download_list'][0]['0'][0]['key'][0], 'required field')
self.assertEqual(request.status_code, 400)
@ -344,6 +361,34 @@ class Mat2APITestCase(unittest.TestCase):
response = json.loads(request.data.decode('utf-8'))
self.assertEqual('File not found', response['message'])
@patch('file_removal_scheduler.random.randint')
def test_api_upload_leftover(self, randint_mock):
os.environ['MAT2_MAX_FILE_AGE_FOR_REMOVAL'] = '0'
app = main.create_app()
self.upload_folder = tempfile.mkdtemp()
app.config.update(
TESTING=True,
UPLOAD_FOLDER=self.upload_folder
)
app = app.test_client()
randint_mock.return_value = 1
self.upload_download_test_jpg_and_assert_response_code(app, 200)
randint_mock.return_value = 0
self.upload_download_test_jpg_and_assert_response_code(app, 404)
os.environ['MAT2_MAX_FILE_AGE_FOR_REMOVAL'] = '9999'
def upload_download_test_jpg_and_assert_response_code(self, app, code):
request = app.post('/api/upload',
data='{"file_name": "test_name.jpg", '
'"file": "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAf'
'FcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg=="}',
headers={'content-type': 'application/json'}
)
download_link = json.loads(request.data.decode('utf-8'))['download_link']
request = app.get(download_link)
self.assertEqual(code, request.status_code)
if __name__ == '__main__':
unittest.main()

View File

@ -0,0 +1,48 @@
import unittest
import tempfile
from os import path, environ
import shutil
import file_removal_scheduler
import main
class Mat2WebTestCase(unittest.TestCase):
def setUp(self):
self.upload_folder = tempfile.mkdtemp()
app = main.create_app()
app.config.update(
TESTING=True,
UPLOAD_FOLDER=self.upload_folder
)
self.app = app
def test_removal(self):
filename = 'test_name.cleaned.jpg'
environ['MAT2_MAX_FILE_AGE_FOR_REMOVAL'] = '0'
open(path.join(self.upload_folder, filename), 'a').close()
self.assertTrue(path.exists(path.join(self.upload_folder, )))
for i in range(0, 11):
file_removal_scheduler.run_file_removal_job(self.app.config['UPLOAD_FOLDER'])
self.assertFalse(path.exists(path.join(self.upload_folder, filename)))
open(path.join(self.upload_folder, filename), 'a').close()
file_removal_scheduler.run_file_removal_job(self.app.config['UPLOAD_FOLDER'])
self.assertTrue(path.exists(path.join(self.upload_folder, )))
def test_non_removal(self):
filename = u'i_should_no_be_removed.txt'
environ['MAT2_MAX_FILE_AGE_FOR_REMOVAL'] = '9999999'
open(path.join(self.upload_folder, filename), 'a').close()
self.assertTrue(path.exists(path.join(self.upload_folder, filename)))
for i in range(0, 11):
file_removal_scheduler.run_file_removal_job(self.app.config['UPLOAD_FOLDER'])
self.assertTrue(path.exists(path.join(self.upload_folder, filename)))
def tearDown(self):
shutil.rmtree(self.upload_folder)
if __name__ == '__main__':
unittest.main()