Add brozzler-list-jobs, brozzler-list-sites and brozzler-list-pages console
scripts to brozzler/cli.py and register them in setup.py. The Jsonner JSON
encoder moves from inside brozzler_list_captures to module level and now also
base64-encodes bytes values. The simpleclicks behavior skips click targets
that isVisible() reports as not rendered. Version is bumped to 1.1b9.dev165.

NOTE(review): line breaks and leading whitespace in this patch were
reconstructed from a whitespace-collapsed copy. Hunk line counts match the
hunk headers, but the indentation of context lines is a best guess; apply
with whitespace leniency (e.g. "git apply --ignore-whitespace") and inspect
the result. The whitespace-only comment re-indentation in the third
simpleclicks.js.j2 hunk could not be recovered exactly.

NOTE(review): the two 'querying rethinkb' debug messages in
brozzler_list_pages were corrected to 'querying rethinkdb' to match the other
log messages touched by this patch, so the blob ids on the cli.py index line
no longer describe the post-image exactly.

diff --git a/brozzler/cli.py b/brozzler/cli.py
index a1e0621..26efaa9 100644
--- a/brozzler/cli.py
+++ b/brozzler/cli.py
@@ -36,6 +36,7 @@ import traceback
 import warnings
 import yaml
 import shutil
+import base64
 
 def _add_common_options(arg_parser):
     arg_parser.add_argument(
@@ -353,6 +354,143 @@ def brozzler_ensure_tables():
     # sites, pages, jobs tables
     brozzler.frontier.RethinkDbFrontier(r)
 
+class Jsonner(json.JSONEncoder):
+    def default(self, o):
+        if isinstance(o, datetime.datetime):
+            return o.isoformat()
+        elif isinstance(o, bytes):
+            return base64.b64encode(o).decode('ascii')
+        else:
+            return json.JSONEncoder.default(self, o)
+
+def brozzler_list_jobs():
+    arg_parser = argparse.ArgumentParser(
+            prog=os.path.basename(sys.argv[0]),
+            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    arg_parser.add_argument(
+            '-a', '--all', dest='all', action='store_true', help=(
+                'list all jobs (by default, only active jobs are listed)'))
+    _add_rethinkdb_options(arg_parser)
+    _add_common_options(arg_parser)
+
+    args = arg_parser.parse_args(args=sys.argv[1:])
+    _configure_logging(args)
+
+    r = rethinkstuff.Rethinker(
+            args.rethinkdb_servers.split(','), args.rethinkdb_db)
+    reql = r.table('jobs').order_by('id')
+    if not args.all:
+        reql = reql.filter({'status': 'ACTIVE'})
+    logging.debug('querying rethinkdb: %s', reql)
+    results = reql.run()
+    for result in results:
+        print(json.dumps(result, cls=Jsonner, indent=2))
+
+def brozzler_list_sites():
+    arg_parser = argparse.ArgumentParser(
+            prog=os.path.basename(sys.argv[0]),
+            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    arg_parser.add_argument(
+            '-a', '--all', dest='all', action='store_true', help=(
+                'list all sites (by default, only active sites are listed)'))
+    group = arg_parser.add_mutually_exclusive_group()
+    group.add_argument(
+            '--jobless', dest='jobless', action='store_true', help=(
+                'list only jobless sites'))
+    group.add_argument(
+            '--job', dest='job', metavar='JOB_ID', help=(
+                'list only sites for the supplied job'))
+    _add_rethinkdb_options(arg_parser)
+    _add_common_options(arg_parser)
+
+    args = arg_parser.parse_args(args=sys.argv[1:])
+    _configure_logging(args)
+
+    r = rethinkstuff.Rethinker(
+            args.rethinkdb_servers.split(','), args.rethinkdb_db)
+
+    reql = r.table('sites')
+    if args.job:
+        try:
+            job_id = int(args.job)
+        except ValueError:
+            job_id = args.job
+        reql = reql.get_all(job_id, index='job_id')
+    elif args.jobless:
+        reql = reql.filter(~r.row.has_fields('job_id'))
+    if not args.all:
+        reql = reql.filter({'status': 'ACTIVE'})
+    logging.debug('querying rethinkdb: %s', reql)
+    results = reql.run()
+    for result in results:
+        print(json.dumps(result, cls=Jsonner, indent=2))
+
+def brozzler_list_pages():
+    arg_parser = argparse.ArgumentParser(
+            prog=os.path.basename(sys.argv[0]),
+            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    group = arg_parser.add_mutually_exclusive_group(required=True)
+    group.add_argument(
+            '--job', dest='job', metavar='JOB_ID', help=(
+                'list pages for all sites of the supplied job'))
+    group.add_argument(
+            '--site', dest='site', metavar='SITE', help=(
+                'list pages of the supplied site'))
+    group = arg_parser.add_mutually_exclusive_group()
+    group.add_argument(
+            '--queued', dest='queued', action='store_true', help=(
+                'limit only queued pages'))
+    group.add_argument(
+            '--brozzled', dest='brozzled', action='store_true', help=(
+                'limit only pages that have already been brozzled'))
+    group.add_argument(
+            '--claimed', dest='claimed', action='store_true', help=(
+                'limit only pages that are currently claimed by a brozzler '
+                'worker'))
+    _add_rethinkdb_options(arg_parser)
+    _add_common_options(arg_parser)
+
+    args = arg_parser.parse_args(args=sys.argv[1:])
+    _configure_logging(args)
+
+    r = rethinkstuff.Rethinker(
+            args.rethinkdb_servers.split(','), args.rethinkdb_db)
+    if args.job:
+        try:
+            job_id = int(args.job)
+        except ValueError:
+            job_id = args.job
+        reql = r.table('sites').get_all(job_id, index='job_id')['id']
+        logging.debug('querying rethinkdb: %s', reql)
+        site_ids = reql.run()
+    else:
+        try:
+            site_ids = [int(args.site)]
+        except ValueError:
+            site_ids = [args.site]
+
+    for site_id in site_ids:
+        reql = r.table('pages')
+        if args.queued:
+            reql = reql.between(
+                    [site_id, 0, r.minval], [site_id, 0, r.maxval],
+                    index='least_hops')
+        elif args.brozzled:
+            reql = reql.between(
+                    [site_id, 1, r.minval], [site_id, r.maxval, r.maxval],
+                    index='least_hops')
+        else:
+            reql = reql.between(
+                    [site_id, 0, r.minval], [site_id, r.maxval, r.maxval],
+                    index='least_hops')
+        reql = reql.order_by(index="least_hops")
+        if args.claimed:
+            reql = reql.filter({'claimed': True})
+        logging.debug('querying rethinkdb: %s', reql)
+        results = reql.run()
+        for result in results:
+            print(json.dumps(result, cls=Jsonner, indent=2))
+
 def brozzler_list_captures():
     '''
     Handy utility for looking up entries in the rethinkdb "captures" table by
@@ -381,12 +519,6 @@ def brozzler_list_captures():
     r = rethinkstuff.Rethinker(
             args.rethinkdb_servers.split(','), args.rethinkdb_db)
 
-    class Jsonner(json.JSONEncoder):
-        def default(self, o):
-            if isinstance(o, datetime.datetime):
-                return o.isoformat()
-            return json.JSONEncoder.default(self, o)
-
     if args.url_or_sha1[:5] == 'sha1:':
         if args.prefix:
             logging.warn(
@@ -398,7 +530,7 @@ def brozzler_list_captures():
                 [sha1base32, rethinkdb.minval, rethinkdb.minval],
                 [sha1base32, rethinkdb.maxval, rethinkdb.maxval],
                 index='sha1_warc_type')
-        logging.debug('rethinkdb query: %s', reql)
+        logging.debug('querying rethinkdb: %s', reql)
         results = reql.run()
         for result in results:
             print(json.dumps(result, cls=Jsonner, indent=2))
@@ -422,7 +554,7 @@ def brozzler_list_captures():
         reql = reql.filter(
                 lambda capture: (capture['canon_surt'] >= key)
                 & (capture['canon_surt'] <= end_key))
-        logging.debug('rethinkdb query: %s', reql)
+        logging.debug('querying rethinkdb: %s', reql)
         results = reql.run()
         for result in results:
             print(json.dumps(result, cls=Jsonner, indent=2))
diff --git a/brozzler/js-templates/simpleclicks.js.j2 b/brozzler/js-templates/simpleclicks.js.j2
index 95f0008..14652b8 100644
--- a/brozzler/js-templates/simpleclicks.js.j2
+++ b/brozzler/js-templates/simpleclicks.js.j2
@@ -22,6 +22,13 @@ var umbraBehavior = {
     idleSince : null,
     alreadyClicked : {},
 
+    // https://github.com/jquery/jquery/blob/master/src/css/hiddenVisibleSelectors.js
+    // n.b. returns true for elements with visibility:hidden, which occupy
+    // screen real estate but are not visible, or clickable with the ui
+    isVisible : function(elem) {
+        return !!(elem.offsetWidth || elem.offsetHeight || elem.getClientRects().length);
+    },
+
     intervalFunc : function() {
         var clickedSomething = false;
         var somethingLeftBelow = false;
@@ -38,10 +45,11 @@ var umbraBehavior = {
         }
 
         for (var j = 0; j < documents.length; j++) {
-
             var clickTargets = documents[j].querySelectorAll(cssSelector);
-
             for ( var i = 0; i < clickTargets.length; i++) {
+                if (!this.isVisible(clickTargets[i])) {
+                    continue;
+                }
                 if (clickTargets[i].umbraClicked && !clickUntilTimeout) {
                     continue;
                 }
@@ -78,12 +86,12 @@ var umbraBehavior = {
             this.idleSince = null;
         } else if (somethingLeftBelow) {
             // console.log("scrolling because everything on this screen has been clicked but there's more below document.body.clientHeight="
-            //                 + document.body.clientHeight);
+            //     + document.body.clientHeight);
             window.scrollBy(0, 200);
             this.idleSince = null;
         } else if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) {
             // console.log("scrolling because we're not to the bottom yet document.body.clientHeight="
-            //                 + document.body.clientHeight);
+            //     + document.body.clientHeight);
             window.scrollBy(0, 200);
             this.idleSince = null;
         } else if (this.idleSince == null) {
diff --git a/setup.py b/setup.py
index 9106b5c..4601475 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@ def find_package_data(package):
 
 setuptools.setup(
         name='brozzler',
-        version='1.1b9.dev162',
+        version='1.1b9.dev165',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',
@@ -53,6 +53,9 @@ setuptools.setup(
                 'brozzler-worker=brozzler.cli:brozzler_worker',
                 'brozzler-ensure-tables=brozzler.cli:brozzler_ensure_tables',
                 'brozzler-list-captures=brozzler.cli:brozzler_list_captures',
+                'brozzler-list-jobs=brozzler.cli:brozzler_list_jobs',
+                'brozzler-list-sites=brozzler.cli:brozzler_list_sites',
+                'brozzler-list-pages=brozzler.cli:brozzler_list_pages',
                 'brozzler-dashboard=brozzler.dashboard:main',
                 'brozzler-easy=brozzler.easy:main',
                 'brozzler-wayback=brozzler.pywb:main',