file removal background job

This commit is contained in:
jfriedli 2020-03-27 10:59:16 -07:00 committed by jvoisin
parent d50f68ae44
commit 7104b2107d
9 changed files with 196 additions and 19 deletions

View File

@ -52,7 +52,11 @@ Note that you can add multiple hosts from which you want to accept API requests.
a space. a space.
**IMPORTANT:** The default value if the variable is not set is: `Access-Control-Allow-Origin: *` **IMPORTANT:** The default value if the variable is not set is: `Access-Control-Allow-Origin: *`
Configure another environment variable: `MAT2_MAX_FILES_BULK_DOWNLOAD=10` Configure the following environment variables:
- `MAT2_MAX_FILES_BULK_DOWNLOAD=10` Max number of files that can be grouped for a bulk download.
- `MAT2_MAX_FILE_AGE_FOR_REMOVAL=900` Maximum age, in seconds, of a file in the upload folder.
Files older than this are deleted the next time the removal job runs. Default: `15 * 60` (15 minutes).
This specifies the max number of files that can be bulk downloaded using the api. This specifies the max number of files that can be bulk downloaded using the api.
Note: Each file has a max file size of 16mb Note: Each file has a max file size of 16mb
@ -66,10 +70,6 @@ systemctl restart nginx/apache/…
It should now be working. It should now be working.
You should add `find /var/www/mat2-web/uploads/ -type f -mtime +1 -exec rm {} \;`
in a crontab to remove files that people might have uploaded but never
downloaded.
# Deploy via Ansible # Deploy via Ansible
If you happen to be using [Ansible](https://www.ansible.com/), there's an If you happen to be using [Ansible](https://www.ansible.com/), there's an
@ -92,10 +92,6 @@ https://0xacab.org/jvoisin/mat2-web/container_registry
Example: Example:
`docker run -p 80:80 -d -e MAT2_ALLOW_ORIGIN_WHITELIST='https://myhost1.org' registry.0xacab.org/jvoisin/mat2-web:latest` `docker run -p 80:80 -d -e MAT2_ALLOW_ORIGIN_WHITELIST='https://myhost1.org' registry.0xacab.org/jvoisin/mat2-web:latest`
Make sure to add
`find /var/www/mat2-web/uploads/ -type f -mtime +1 -exec rm {} \;` as cron job
run inside the container.
# Development # Development
Install docker and docker-compose and then run `docker-compose up` to setup Install docker and docker-compose and then run `docker-compose up` to setup
the docker dev environment. Mat2-web is now accessible on your host machine at `localhost:5000`. the docker dev environment. Mat2-web is now accessible on your host machine at `localhost:5000`.

View File

@ -9,6 +9,7 @@ services:
- FLASK_ENV=development - FLASK_ENV=development
- MAT2_ALLOW_ORIGIN_WHITELIST=* - MAT2_ALLOW_ORIGIN_WHITELIST=*
- MAT2_MAX_FILES_BULK_DOWNLOAD=10 - MAT2_MAX_FILES_BULK_DOWNLOAD=10
- MAT2_MAX_FILE_AGE_FOR_REMOVAL=60
ports: ports:
- "5000:5000" - "5000:5000"
volumes: volumes:

26
file_removal_scheduler.py Normal file
View File

@ -0,0 +1,26 @@
import glob
import time
import sys
import os
import random
def run_file_removal_job(upload_folder_path):
    """Occasionally sweep the upload folder for files that are too old.

    The sweep only happens when ``random.randint(0, 10)`` yields 0; on any
    other roll the function returns without touching the folder. When it
    does run, every path matching ``upload_folder_path + '/*'`` is handed to
    ``delete_file_when_too_old``.

    :param upload_folder_path: folder whose direct entries are inspected.
    """
    # Roll first so that the common case does no filesystem work at all.
    if random.randint(0, 10) != 0:
        return

    pattern = upload_folder_path + '/*'
    for candidate in glob.glob(pattern):
        delete_file_when_too_old(candidate)
def delete_file_when_too_old(filepath):
    """Delete ``filepath`` if its last modification is older than the limit.

    The limit, in seconds, is read on every call from the
    ``MAT2_MAX_FILE_AGE_FOR_REMOVAL`` environment variable and defaults to
    ``15 * 60``.

    A path that no longer exists, either when it is inspected or when it is
    removed, is treated as already cleaned up and skipped silently. Any
    other failure to remove the file is reported on stdout and ends the
    process through ``sys.exit(1)``.

    :param filepath: path of the file to inspect.
    :raises ValueError: if the environment variable is not an integer.
    """
    try:
        file_mod_time = os.stat(filepath).st_mtime
    except FileNotFoundError:
        # The file vanished after it was listed; nothing is left to remove.
        return

    # time in second since last modification of file
    last_time = time.time() - file_mod_time
    max_age = int(os.environ.get('MAT2_MAX_FILE_AGE_FOR_REMOVAL', 15 * 60))

    # if file is not older than our configured max timeframe, keep it
    if last_time <= max_age:
        return

    try:
        os.remove(filepath)
    except FileNotFoundError:
        # Something else removed it between the stat and the remove; the
        # goal (file gone) is already reached.
        return
    except OSError:
        print('Automatic File Removal failed on file: ' + str(filepath))
        sys.exit(1)

20
main.py
View File

@ -10,6 +10,7 @@ import zipfile
from cerberus import Validator from cerberus import Validator
import utils import utils
import file_removal_scheduler
from libmat2 import parser_factory from libmat2 import parser_factory
from flask import Flask, flash, request, redirect, url_for, render_template, send_from_directory, after_this_request from flask import Flask, flash, request, redirect, url_for, render_template, send_from_directory, after_this_request
from flask_restful import Resource, Api, reqparse, abort from flask_restful import Resource, Api, reqparse, abort
@ -25,31 +26,36 @@ def create_app(test_config=None):
app.config['UPLOAD_FOLDER'] = './uploads/' app.config['UPLOAD_FOLDER'] = './uploads/'
app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024 # 16MB app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024 # 16MB
app.config['CUSTOM_TEMPLATES_DIR'] = 'custom_templates' app.config['CUSTOM_TEMPLATES_DIR'] = 'custom_templates'
app.config.from_object('config') # optionally load settings from config.py # optionally load settings from config.py
app.config.from_object('config')
if test_config is not None:
app.config.update(test_config)
app.jinja_loader = jinja2.ChoiceLoader([ # type: ignore app.jinja_loader = jinja2.ChoiceLoader([ # type: ignore
jinja2.FileSystemLoader(app.config['CUSTOM_TEMPLATES_DIR']), jinja2.FileSystemLoader(app.config['CUSTOM_TEMPLATES_DIR']),
app.jinja_loader, app.jinja_loader,
]) ])
api = Api(app) api = Api(app)
CORS(app, resources={r"/api/*": {"origins": utils.get_allow_origin_header_value()}}) CORS(app, resources={r"/api/*": {"origins": utils.get_allow_origin_header_value()}})
@app.route('/download/<string:key>/<string:filename>') @app.route('/download/<string:key>/<string:filename>')
def download_file(key: str, filename:str): def download_file(key: str, filename: str):
if filename != secure_filename(filename): if filename != secure_filename(filename):
return redirect(url_for('upload_file')) return redirect(url_for('upload_file'))
complete_path, filepath = get_file_paths(filename) complete_path, filepath = get_file_paths(filename)
file_removal_scheduler.run_file_removal_job(app.config['UPLOAD_FOLDER'])
if not os.path.exists(complete_path): if not os.path.exists(complete_path):
return redirect(url_for('upload_file')) return redirect(url_for('upload_file'))
if hmac.compare_digest(utils.hash_file(complete_path), key) is False: if hmac.compare_digest(utils.hash_file(complete_path), key) is False:
return redirect(url_for('upload_file')) return redirect(url_for('upload_file'))
@after_this_request @after_this_request
def remove_file(response): def remove_file(response):
os.remove(complete_path) if os.path.exists(complete_path):
os.remove(complete_path)
return response return response
return send_from_directory(app.config['UPLOAD_FOLDER'], filepath, as_attachment=True) return send_from_directory(app.config['UPLOAD_FOLDER'], filepath, as_attachment=True)
@ -176,9 +182,11 @@ def create_app(test_config=None):
complete_path, filepath = is_valid_api_download_file(filename, key) complete_path, filepath = is_valid_api_download_file(filename, key)
# Make sure the file is NOT deleted on HEAD requests # Make sure the file is NOT deleted on HEAD requests
if request.method == 'GET': if request.method == 'GET':
file_removal_scheduler.run_file_removal_job(app.config['UPLOAD_FOLDER'])
@after_this_request @after_this_request
def remove_file(response): def remove_file(response):
os.remove(complete_path) if os.path.exists(complete_path):
os.remove(complete_path)
return response return response
return send_from_directory(app.config['UPLOAD_FOLDER'], filepath, as_attachment=True) return send_from_directory(app.config['UPLOAD_FOLDER'], filepath, as_attachment=True)

View File

@ -5,4 +5,4 @@ mat2==0.9.0
flask==1.0.3 flask==1.0.3
Flask-RESTful==0.3.7 Flask-RESTful==0.3.7
Flask-Cors==3.0.8 Flask-Cors==3.0.8
Cerberus==1.3.1 Cerberus==1.3.1

View File

@ -14,7 +14,7 @@ mat2 <b>could not</b> remove all the metadata from <pre>{{ filename }}</pre>, th
</ul> </ul>
{%endif %} {%endif %}
</p> </p>
<a class="button button-primary" download href='{{ url_for('download_file', key=key, filename=filename) }}'>⇩ Download cleaned file</a> <a class="button button-primary" href='{{ url_for('download_file', key=key, filename=filename) }}'>⇩ Download cleaned file</a>
<hr/> <hr/>

View File

@ -1,9 +1,12 @@
import base64
import unittest import unittest
import tempfile import tempfile
import shutil import shutil
import io import io
import os import os
from unittest.mock import patch
import main import main
@ -62,6 +65,33 @@ class Mat2WebTestCase(unittest.TestCase):
rv.data) rv.data)
self.assertEqual(rv.status_code, 200) self.assertEqual(rv.status_code, 200)
def test_get_upload_no_selected_file(self):
    """Posting a file part with an empty filename shows 'No selected file'."""
    empty_upload = (io.BytesIO(b""), '')
    response = self.app.post(
        '/',
        data={'file': empty_upload},
        follow_redirects=True,
    )
    self.assertIn(b'No selected file', response.data)
    self.assertEqual(response.status_code, 200)
def test_failed_cleaning(self):
    """Uploading an archive that cannot be cleaned reports 'Unable to clean'.

    The upload is posted as ``test.zip`` with redirects followed; the final
    page must contain ``Unable to clean`` and be served with status 200.
    """
    # Base64-encoded bytes of the uploaded archive. The embedded entry names
    # appear to be 'failing.not-working-ext' and 'test.json'.
    zip_file_bytes = base64.b64decode(
        'UEsDBBQACAAIAPicPE8AAAAAAAAAAAAAAAAXACAAZmFpbGluZy5ub3Qtd29ya2luZy1le'
        'HRVVA0AB+Saj13kmo9d5JqPXXV4CwABBOkDAAAE6QMAAAMAUEsHCAAAAAACAAAAAAAAAFBL'
        'AwQUAAgACAD6nDxPAAAAAAAAAAAAAAAACQAgAHRlc3QuanNvblVUDQAH6JqPXeiaj13omo9d'
        'dXgLAAEE6QMAAATpAwAAAwBQSwcIAAAAAAIAAAAAAAAAUEsBAhQDFAAIAAgA+Jw8TwAAAAACA'
        'AAAAAAAABcAIAAAAAAAAAAAAKSBAAAAAGZhaWxpbmcubm90LXdvcmtpbmctZXh0VVQNAAfkmo9'
        'd5JqPXeSaj111eAsAAQTpAwAABOkDAABQSwECFAMUAAgACAD6nDxPAAAAAAIAAAAAAAAACQAgA'
        'AAAAAAAAAAApIFnAAAAdGVzdC5qc29uVVQNAAfomo9d6JqPXeiaj111eAsAAQTpAwAABOkDAAB'
        'QSwUGAAAAAAIAAgC8AAAAwAAAAAAA'
    )
    rv = self.app.post('/',
                       data=dict(
                           file=(io.BytesIO(zip_file_bytes), 'test.zip'),
                       ), follow_redirects=True)
    self.assertIn(b'Unable to clean',rv.data)
    self.assertEqual(rv.status_code, 200)
def test_get_upload_no_file_name(self): def test_get_upload_no_file_name(self):
rv = self.app.post('/', rv = self.app.post('/',
data=dict( data=dict(
@ -97,6 +127,29 @@ class Mat2WebTestCase(unittest.TestCase):
rv = self.app.get('/download/70623619c449a040968cdbea85945bf384fa30ed2d5d24fa3/test.cleaned.txt') rv = self.app.get('/download/70623619c449a040968cdbea85945bf384fa30ed2d5d24fa3/test.cleaned.txt')
self.assertEqual(rv.status_code, 302) self.assertEqual(rv.status_code, 302)
@patch('file_removal_scheduler.random.randint')
def test_upload_leftover(self, randint_mock):
    """A download request for a leftover upload is answered with a redirect.

    ``random.randint`` is forced to 0 so the removal job runs, and the
    maximum file age is set to 0 seconds. The environment variable is put
    back to ``'9999'`` in a ``finally`` clause so a failing assertion cannot
    leak the 0-second limit into other tests.
    """
    randint_mock.return_value = 0
    os.environ['MAT2_MAX_FILE_AGE_FOR_REMOVAL'] = '0'
    try:
        app = main.create_app()
        self.upload_folder = tempfile.mkdtemp()
        app.config.update(
            TESTING=True,
            UPLOAD_FOLDER=self.upload_folder
        )
        app = app.test_client()

        # NOTE(review): the upload goes through self.app while the download
        # below uses the local client with a fresh upload folder, so the
        # requested file presumably never exists in that folder - confirm
        # whether the upload was meant to use the local client too.
        request = self.app.post('/',
                                data=dict(
                                    file=(io.BytesIO(b"Some text"), 'test.txt'),
                                ), follow_redirects=True)
        self.assertEqual(request.status_code, 200)

        request = app.get(
            b'/download/4c2e9e6da31a64c70623619c449a040968cdbea85945bf384fa30ed2d5d24fa3/test.cleaned.txt'
        )
        self.assertEqual(302, request.status_code)
    finally:
        os.environ['MAT2_MAX_FILE_AGE_FOR_REMOVAL'] = '9999'
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()

View File

@ -4,9 +4,10 @@ import json
import os import os
import shutil import shutil
import zipfile import zipfile
from six import BytesIO from six import BytesIO
from unittest.mock import patch
import main import main
@ -122,6 +123,23 @@ class Mat2APITestCase(unittest.TestCase):
rv = self.app.get('/api/extension', headers={'Origin': 'origin1.gnu'}) rv = self.app.get('/api/extension', headers={'Origin': 'origin1.gnu'})
self.assertEqual(rv.headers['Access-Control-Allow-Origin'], 'origin1.gnu') self.assertEqual(rv.headers['Access-Control-Allow-Origin'], 'origin1.gnu')
def test_api_cleaning_failed(self):
    """The upload API reports 'Unable to clean application/zip' for this archive.

    The JSON body carries the file name ``test_name.zip`` and the archive as
    a base64 string; the ``message`` field of the JSON response is checked.
    """
    # The JSON document is assembled from adjacent string literals; the
    # "file" value is the base64-encoded archive.
    request = self.app.post('/api/upload',
                            data='{"file_name": "test_name.zip", '
                                 '"file": "UEsDBBQACAAIAPicPE8AAAAAAAAAAAAAAAAXACAAZmFpbGluZy5ub3Qt'
                                 'd29ya2luZy1leHRVVA0AB+Saj13kmo9d5JqPXXV4CwABBOkDAAAE6QMAAAMAUEsHCAAA'
                                 'AAACAAAAAAAAAFBLAwQUAAgACAD6nDxPAAAAAAAAAAAAAAAACQAgAHRlc3QuanNvblVUD'
                                 'QAH6JqPXeiaj13omo9ddXgLAAEE6QMAAATpAwAAAwBQSwcIAAAAAAIAAAAAAAAAUEsBAhQD'
                                 'FAAIAAgA+Jw8TwAAAAACAAAAAAAAABcAIAAAAAAAAAAAAKSBAAAAAGZhaWxpbmcubm90LXd'
                                 'vcmtpbmctZXh0VVQNAAfkmo9d5JqPXeSaj111eAsAAQTpAwAABOkDAABQSwECFAMUAAgACAD6'
                                 'nDxPAAAAAAIAAAAAAAAACQAgAAAAAAAAAAAApIFnAAAAdGVzdC5qc29uVVQNAAfomo9d6JqPXe'
                                 'iaj111eAsAAQTpAwAABOkDAABQSwUGAAAAAAIAAgC8AAAAwAAAAAAA"}',
                            headers={'content-type': 'application/json'}
                            )
    error = json.loads(request.data.decode('utf-8'))['message']
    self.assertEqual(error, 'Unable to clean application/zip')
def test_api_download(self): def test_api_download(self):
request = self.app.post('/api/upload', request = self.app.post('/api/upload',
data='{"file_name": "test_name.jpg", ' data='{"file_name": "test_name.jpg", '
@ -263,7 +281,6 @@ class Mat2APITestCase(unittest.TestCase):
) )
response = json.loads(request.data.decode('utf-8')) response = json.loads(request.data.decode('utf-8'))
print(response)
self.assertEqual(response['message']['download_list'][0]['0'][0]['file_name'][0], 'required field') self.assertEqual(response['message']['download_list'][0]['0'][0]['file_name'][0], 'required field')
self.assertEqual(response['message']['download_list'][0]['0'][0]['key'][0], 'required field') self.assertEqual(response['message']['download_list'][0]['0'][0]['key'][0], 'required field')
self.assertEqual(request.status_code, 400) self.assertEqual(request.status_code, 400)
@ -344,6 +361,34 @@ class Mat2APITestCase(unittest.TestCase):
response = json.loads(request.data.decode('utf-8')) response = json.loads(request.data.decode('utf-8'))
self.assertEqual('File not found', response['message']) self.assertEqual('File not found', response['message'])
@patch('file_removal_scheduler.random.randint')
def test_api_upload_leftover(self, randint_mock):
    """API downloads succeed while the removal job is skipped and 404 once it runs.

    The maximum file age is set to 0 seconds. With ``random.randint``
    returning 1 the upload/download round trip must yield 200; with 0 it
    must yield 404. The environment variable is put back to ``'9999'`` in a
    ``finally`` clause so a failing assertion cannot leak the 0-second limit
    into other tests.
    """
    os.environ['MAT2_MAX_FILE_AGE_FOR_REMOVAL'] = '0'
    try:
        app = main.create_app()
        self.upload_folder = tempfile.mkdtemp()
        app.config.update(
            TESTING=True,
            UPLOAD_FOLDER=self.upload_folder
        )
        app = app.test_client()

        # A non-zero roll means the removal job does not sweep the folder.
        randint_mock.return_value = 1
        self.upload_download_test_jpg_and_assert_response_code(app, 200)

        # A roll of 0 makes the removal job sweep the folder on download.
        randint_mock.return_value = 0
        self.upload_download_test_jpg_and_assert_response_code(app, 404)
    finally:
        os.environ['MAT2_MAX_FILE_AGE_FOR_REMOVAL'] = '9999'
def upload_download_test_jpg_and_assert_response_code(self, app, code):
    """Upload ``test_name.jpg`` through the API and fetch its download link.

    :param app: client used for both the upload and the download request.
    :param code: status code the download response is expected to have.
    """
    payload = (
        '{"file_name": "test_name.jpg", '
        '"file": "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAf'
        'FcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg=="}'
    )
    upload_response = app.post(
        '/api/upload',
        data=payload,
        headers={'content-type': 'application/json'},
    )
    upload_body = json.loads(upload_response.data.decode('utf-8'))
    download_response = app.get(upload_body['download_link'])
    self.assertEqual(code, download_response.status_code)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()

View File

@ -0,0 +1,48 @@
import unittest
import tempfile
from os import path, environ
import shutil
import file_removal_scheduler
import main
class Mat2WebTestCase(unittest.TestCase):
    """Exercise ``file_removal_scheduler.run_file_removal_job`` on a temp folder.

    ``random.randint`` is replaced for the duration of each test so the test
    itself decides whether the job sweeps the folder, instead of relying on
    repeated random rolls. Changes to the process environment are undone
    after each test.
    """

    def setUp(self):
        # Imported locally so this class does not depend on the module's
        # import block being changed.
        from unittest import mock

        self.upload_folder = tempfile.mkdtemp()
        app = main.create_app()
        app.config.update(
            TESTING=True,
            UPLOAD_FOLDER=self.upload_folder
        )
        self.app = app

        # Snapshot the environment; it is restored when the test finishes.
        env_patcher = mock.patch.dict(environ)
        env_patcher.start()
        self.addCleanup(env_patcher.stop)

        # Take control of the roll that decides whether the job runs.
        randint_patcher = mock.patch.object(
            file_removal_scheduler.random, 'randint'
        )
        self.randint_mock = randint_patcher.start()
        self.addCleanup(randint_patcher.stop)

    def _run_job(self):
        """Invoke the removal job once on the configured upload folder."""
        file_removal_scheduler.run_file_removal_job(
            self.app.config['UPLOAD_FOLDER']
        )

    def test_removal(self):
        """A file past the age limit is removed only when the job sweeps."""
        filename = 'test_name.cleaned.jpg'
        filepath = path.join(self.upload_folder, filename)
        environ['MAT2_MAX_FILE_AGE_FOR_REMOVAL'] = '0'

        open(filepath, 'a').close()
        self.assertTrue(path.exists(filepath))

        # A roll of 0 makes the job sweep the folder.
        self.randint_mock.return_value = 0
        self._run_job()
        self.assertFalse(path.exists(filepath))

        # Any other roll skips the sweep, so the new file must survive.
        open(filepath, 'a').close()
        self.randint_mock.return_value = 1
        self._run_job()
        self.assertTrue(path.exists(filepath))

    def test_non_removal(self):
        """A file younger than the age limit survives a sweep."""
        filename = u'i_should_no_be_removed.txt'
        filepath = path.join(self.upload_folder, filename)
        environ['MAT2_MAX_FILE_AGE_FOR_REMOVAL'] = '9999999'

        open(filepath, 'a').close()
        self.assertTrue(path.exists(filepath))

        # Force the sweep so the age check itself is what keeps the file.
        self.randint_mock.return_value = 0
        self._run_job()
        self.assertTrue(path.exists(filepath))

    def tearDown(self):
        shutil.rmtree(self.upload_folder)
if __name__ == '__main__':
unittest.main()