Merge pull request #150 from nlevitt/purge-old

Purge old
This commit is contained in:
Noah Levitt 2019-05-16 00:29:58 -07:00 committed by GitHub
commit 42ddfba923
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 23 additions and 5 deletions

View file

@@ -2,7 +2,7 @@
''' '''
brozzler/cli.py - brozzler command line executables brozzler/cli.py - brozzler command line executables
Copyright (C) 2014-2017 Internet Archive Copyright (C) 2014-2019 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
@@ -605,6 +605,10 @@ def brozzler_purge(argv=None):
'--site', dest='site', metavar='SITE_ID', help=( '--site', dest='site', metavar='SITE_ID', help=(
'purge crawl state from rethinkdb for a site, including all ' 'purge crawl state from rethinkdb for a site, including all '
'pages')) 'pages'))
group.add_argument(
'--finished-before', dest='finished_before', metavar='YYYY-MM-DD',
help=('purge crawl state from rethinkdb for a jobs that ended '
'before this date'))
arg_parser.add_argument( arg_parser.add_argument(
'--force', dest='force', action='store_true', help=( '--force', dest='force', action='store_true', help=(
'purge even if job or site is still has status ACTIVE')) 'purge even if job or site is still has status ACTIVE'))
@@ -653,6 +657,20 @@ def brozzler_purge(argv=None):
'(override with --force)', site_id) '(override with --force)', site_id)
sys.exit(1) sys.exit(1)
_purge_site(rr, site_id) _purge_site(rr, site_id)
elif args.finished_before:
finished_before = datetime.datetime.strptime(
args.finished_before, '%Y-%m-%d').replace(
tzinfo=doublethink.UTC)
reql = rr.table('jobs').filter(
r.row['finished'].default(r.maxval).lt(finished_before).or_(
r.row['starts_and_stops'].nth(-1)['stop'].default(r.maxval).lt(finished_before)))
logging.debug(
'retrieving jobs older than %s: %s', finished_before, reql)
for job in reql.run():
# logging.info('job %s finished=%s starts_and_stops[-1]["stop"]=%s',
# job['id'], job.get('finished'),
# job.get('starts_and_stops', [{'stop':None}])[-1]['stop'])
_purge_job(rr, job['id'])
def _purge_site(rr, site_id): def _purge_site(rr, site_id):
reql = rr.table('pages').between( reql = rr.table('pages').between(

View file

@@ -64,10 +64,10 @@ setuptools.setup(
], ],
}, },
install_requires=[ install_requires=[
'PyYAML>=3.12', 'PyYAML>=5.1',
'youtube-dl>=2018.7.21', 'youtube-dl>=2018.7.21',
'reppy==0.3.4', 'reppy==0.3.4',
'requests>=2.18.4', 'requests>=2.21',
'websocket-client>=0.39.0,<=0.48.0', 'websocket-client>=0.39.0,<=0.48.0',
'pillow>=5.2.0', 'pillow>=5.2.0',
'urlcanon>=0.1.dev23', 'urlcanon>=0.1.dev23',
@@ -80,13 +80,13 @@ setuptools.setup(
], ],
extras_require={ extras_require={
'dashboard': [ 'dashboard': [
'flask>=0.11', 'flask>=1.0',
'gunicorn>=19.8.1' 'gunicorn>=19.8.1'
], ],
'easy': [ 'easy': [
'warcprox>=2.4b2.dev173', 'warcprox>=2.4b2.dev173',
'pywb>=0.33.2,<2', 'pywb>=0.33.2,<2',
'flask>=0.11', 'flask>=1.0',
'gunicorn>=19.8.1' 'gunicorn>=19.8.1'
], ],
}, },