From 174178e02ef60675119b371c683c5c692e0b4d60 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 25 Sep 2018 14:56:26 -0700 Subject: [PATCH 1/3] new command brozzler-purge --- brozzler/cli.py | 58 +++++++++++++++++++++++++++++++++++++++++++++++++ setup.py | 1 + 2 files changed, 59 insertions(+) diff --git a/brozzler/cli.py b/brozzler/cli.py index d843fe9..bde438e 100644 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -588,6 +588,64 @@ def brozzler_list_pages(argv=None): for result in results: print(json.dumps(result, cls=Jsonner, indent=2)) +def brozzler_purge(argv=None): + argv = argv or sys.argv + arg_parser = argparse.ArgumentParser( + prog=os.path.basename(argv[0]), + formatter_class=BetterArgumentDefaultsHelpFormatter) + group = arg_parser.add_mutually_exclusive_group(required=True) + group.add_argument( + '--job', dest='job', metavar='JOB_ID', help=( + 'delete crawl state from rethinkdb for a job, including all ' + 'sites and pages')) + group.add_argument( + '--site', dest='site', metavar='SITE_ID', help=( + 'delete crawl state from rethinkdb for a site, including all ' + 'pages')) + add_rethinkdb_options(arg_parser) + add_common_options(arg_parser, argv) + + args = arg_parser.parse_args(args=argv[1:]) + configure_logging(args) + + rr = rethinker(args) + import pdb; pdb.set_trace() + if args.job: + try: + job_id = int(args.job) + except ValueError: + job_id = args.job + _purge_job(rr, job_id) + elif args.site: + site_id = args.site + _purge_site(rr, site_id) + +def _purge_site(rr, site_id): + reql = rr.table('pages').between( + [site_id, r.minval, r.minval], + [site_id, r.maxval, r.maxval], + index='priority_by_site').delete() + logging.debug('deleting pages for site %s: %s', site_id, reql) + result = reql.run() + logging.info('deleted pages for site %s: %s', site_id, result) + + reql = rr.table('sites').get(site_id).delete() + logging.debug('deleting site %s: %s', site_id, reql) + result = reql.run() + logging.info('deleted site %s: %s', site_id, result) + 
+def _purge_job(rr, job_id): + reql = rr.table('sites').get_all(job_id, index='job_id').get_field('id') + logging.debug('querying rethinkdb: %s', reql) + site_ids = list(reql.run()) + for site_id in site_ids: + _purge_site(rr, site_id) + + reql = rr.table('jobs').get(job_id).delete() + logging.debug('deleting job %s: %s', job_id, reql) + result = reql.run() + logging.info('deleted job %s: %s', job_id, result) + def brozzler_list_captures(argv=None): ''' Handy utility for looking up entries in the rethinkdb "captures" table by diff --git a/setup.py b/setup.py index 6c5b897..a74dc07 100644 --- a/setup.py +++ b/setup.py @@ -57,6 +57,7 @@ setuptools.setup( 'brozzler-list-sites=brozzler.cli:brozzler_list_sites', 'brozzler-list-pages=brozzler.cli:brozzler_list_pages', 'brozzler-stop-crawl=brozzler.cli:brozzler_stop_crawl', + 'brozzler-purge=brozzler.cli:brozzler_purge', 'brozzler-dashboard=brozzler.dashboard:main', 'brozzler-easy=brozzler.easy:main', 'brozzler-wayback=brozzler.pywb:main', From 560981c1ad2da46e5abd3c768cbef3dda8f3498c Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 25 Sep 2018 15:17:45 -0700 Subject: [PATCH 2/3] safety check and --force for brozzler-purge --- brozzler/cli.py | 45 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 38 insertions(+), 7 deletions(-) diff --git a/brozzler/cli.py b/brozzler/cli.py index bde438e..b5791cf 100644 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -602,6 +602,9 @@ def brozzler_purge(argv=None): '--site', dest='site', metavar='SITE_ID', help=( 'delete crawl state from rethinkdb for a site, including all ' 'pages')) + arg_parser.add_argument( + '--force', dest='force', action='store_true', help=( + 'purge even if job or site still has status ACTIVE')) add_rethinkdb_options(arg_parser) add_common_options(arg_parser, argv) @@ -609,15 +612,43 @@ def brozzler_purge(argv=None): configure_logging(args) rr = rethinker(args) - import pdb; pdb.set_trace() + frontier = brozzler.RethinkDbFrontier(rr) if 
args.job: try: job_id = int(args.job) except ValueError: job_id = args.job + job = brozzler.Job.load(rr, job_id) + if not job: + logging.fatal('no such job %r', job_id) + sys.exit(1) + if job.status == 'ACTIVE': + if args.force: + logging.warning( + 'job %s has status ACTIVE, purging anyway because ' + '--force was supplied', job_id) + else: + logging.fatal( + 'refusing to purge job %s because status is ACTIVE ' + '(override with --force)', job_id) + sys.exit(1) _purge_job(rr, job_id) elif args.site: site_id = args.site + site = brozzler.Site.load(rr, site_id) + if not site: + logging.fatal('no such site %r', site_id) + sys.exit(1) + if site.status == 'ACTIVE': + if args.force: + logging.warning( + 'site %s has status ACTIVE, purging anyway because ' + '--force was supplied', site_id) + else: + logging.fatal( + 'refusing to purge site %s because status is ACTIVE ' + '(override with --force)', site_id) + sys.exit(1) _purge_site(rr, site_id) def _purge_site(rr, site_id): @@ -625,14 +656,14 @@ def _purge_site(rr, site_id): [site_id, r.minval, r.minval], [site_id, r.maxval, r.maxval], index='priority_by_site').delete() - logging.debug('deleting pages for site %s: %s', site_id, reql) + logging.debug('purging pages for site %s: %s', site_id, reql) result = reql.run() - logging.info('deleted pages for site %s: %s', site_id, result) + logging.info('purged pages for site %s: %s', site_id, result) reql = rr.table('sites').get(site_id).delete() - logging.debug('deleting site %s: %s', site_id, reql) + logging.debug('purging site %s: %s', site_id, reql) result = reql.run() - logging.info('deleted site %s: %s', site_id, result) + logging.info('purged site %s: %s', site_id, result) def _purge_job(rr, job_id): reql = rr.table('sites').get_all(job_id, index='job_id').get_field('id') @@ -642,9 +673,9 @@ def _purge_job(rr, job_id): _purge_site(rr, site_id) reql = rr.table('jobs').get(job_id).delete() - logging.debug('deleting job %s: %s', job_id, reql) + logging.debug('purging job %s: %s', 
job_id, reql) result = reql.run() - logging.info('deleted job %s: %s', job_id, result) + logging.info('purged job %s: %s', job_id, result) def brozzler_list_captures(argv=None): ''' From f4fad934a79c5e792b7c66d8601b2c540f606740 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 25 Sep 2018 15:19:33 -0700 Subject: [PATCH 3/3] verbiage tweaks --- brozzler/cli.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/brozzler/cli.py b/brozzler/cli.py index b5791cf..85f0fc4 100644 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -592,15 +592,16 @@ def brozzler_purge(argv=None): argv = argv or sys.argv arg_parser = argparse.ArgumentParser( prog=os.path.basename(argv[0]), + description='brozzler-purge - purge crawl state from rethinkdb', formatter_class=BetterArgumentDefaultsHelpFormatter) group = arg_parser.add_mutually_exclusive_group(required=True) group.add_argument( '--job', dest='job', metavar='JOB_ID', help=( - 'delete crawl state from rethinkdb for a job, including all ' + 'purge crawl state from rethinkdb for a job, including all ' 'sites and pages')) group.add_argument( '--site', dest='site', metavar='SITE_ID', help=( - 'delete crawl state from rethinkdb for a site, including all ' + 'purge crawl state from rethinkdb for a site, including all ' 'pages')) arg_parser.add_argument( '--force', dest='force', action='store_true', help=(