mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-21 08:06:27 -04:00
Merge branch 'master' into qa
* master: simpleclicks - only click if element is visible, fixes spinning on moma.org sites implement brozzler-list-pages new cli utils brozzler-list-jobs and brozzler-list-sites
This commit is contained in:
commit
77b34bd5b2
148
brozzler/cli.py
148
brozzler/cli.py
@ -36,6 +36,7 @@ import traceback
|
||||
import warnings
|
||||
import yaml
|
||||
import shutil
|
||||
import base64
|
||||
|
||||
def _add_common_options(arg_parser):
|
||||
arg_parser.add_argument(
|
||||
@ -353,6 +354,143 @@ def brozzler_ensure_tables():
|
||||
# sites, pages, jobs tables
|
||||
brozzler.frontier.RethinkDbFrontier(r)
|
||||
|
||||
class Jsonner(json.JSONEncoder):
    '''
    JSON encoder that additionally understands two types the stock encoder
    rejects: `datetime.datetime` values are written as their `isoformat()`
    string, and `bytes` values are written as base64 text.
    '''
    def default(self, o):
        '''
        Returns a json-serializable stand-in for `o`; anything that is
        neither `bytes` nor `datetime.datetime` is handed to the base class,
        which raises `TypeError`.
        '''
        if isinstance(o, bytes):
            encoded = base64.b64encode(o)
            return encoded.decode('ascii')
        if isinstance(o, datetime.datetime):
            return o.isoformat()
        return super().default(o)
|
||||
|
||||
def brozzler_list_jobs():
    '''
    Command line entry point that prints documents from the rethinkdb "jobs"
    table as indented json, ordered by id.

    Only jobs whose status is "ACTIVE" are printed unless -a/--all is
    supplied. Arguments are taken from `sys.argv`.
    '''
    parser = argparse.ArgumentParser(
            prog=os.path.basename(sys.argv[0]),
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
            '-a', '--all', dest='all', action='store_true',
            help='list all jobs (by default, only active jobs are listed)')
    _add_rethinkdb_options(parser)
    _add_common_options(parser)

    args = parser.parse_args(args=sys.argv[1:])
    _configure_logging(args)

    servers = args.rethinkdb_servers.split(',')
    r = rethinkstuff.Rethinker(servers, args.rethinkdb_db)

    query = r.table('jobs').order_by('id')
    if not args.all:
        # default view: restrict to jobs still marked active
        query = query.filter({'status': 'ACTIVE'})
    logging.debug('querying rethinkdb: %s', query)

    for job in query.run():
        print(json.dumps(job, cls=Jsonner, indent=2))
|
||||
|
||||
def brozzler_list_sites():
    '''
    Command line entry point that prints documents from the rethinkdb
    "sites" table as indented json.

    --job JOB_ID restricts output to sites of that job (looked up through
    the "job_id" index); --jobless restricts output to sites that have no
    "job_id" field. The two options are mutually exclusive. Unless -a/--all
    is supplied, only sites whose status is "ACTIVE" are printed. Arguments
    are taken from `sys.argv`.
    '''
    parser = argparse.ArgumentParser(
            prog=os.path.basename(sys.argv[0]),
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
            '-a', '--all', dest='all', action='store_true',
            help='list all sites (by default, only active sites are listed)')
    job_selection = parser.add_mutually_exclusive_group()
    job_selection.add_argument(
            '--jobless', dest='jobless', action='store_true',
            help='list only jobless sites')
    job_selection.add_argument(
            '--job', dest='job', metavar='JOB_ID',
            help='list only sites for the supplied job')
    _add_rethinkdb_options(parser)
    _add_common_options(parser)

    args = parser.parse_args(args=sys.argv[1:])
    _configure_logging(args)

    servers = args.rethinkdb_servers.split(',')
    r = rethinkstuff.Rethinker(servers, args.rethinkdb_db)

    query = r.table('sites')
    if args.job:
        # the id is used as an int when the argument parses as one,
        # otherwise as the string that was supplied
        try:
            job_id = int(args.job)
        except ValueError:
            job_id = args.job
        query = query.get_all(job_id, index='job_id')
    elif args.jobless:
        query = query.filter(~r.row.has_fields('job_id'))
    if not args.all:
        # default view: restrict to sites still marked active
        query = query.filter({'status': 'ACTIVE'})
    logging.debug('querying rethinkdb: %s', query)

    for site in query.run():
        print(json.dumps(site, cls=Jsonner, indent=2))
|
||||
|
||||
def brozzler_list_pages():
    '''
    Command line entry point that prints documents from the rethinkdb
    "pages" table as indented json.

    Exactly one of --job JOB_ID or --site SITE is required. With --job, the
    ids of the job's sites are looked up through the "job_id" index of the
    "sites" table and the pages of each of those sites are printed in turn;
    with --site, only that site's pages are printed.

    At most one of --queued, --brozzled, --claimed may be supplied to narrow
    the output. Pages of a site are read through, and ordered by, the
    "least_hops" index. Arguments are taken from `sys.argv`.
    '''
    def parse_id(value):
        # ids are used as ints when the argument parses as one, otherwise
        # as the string that was supplied
        try:
            return int(value)
        except ValueError:
            return value

    arg_parser = argparse.ArgumentParser(
            prog=os.path.basename(sys.argv[0]),
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    group = arg_parser.add_mutually_exclusive_group(required=True)
    group.add_argument(
            '--job', dest='job', metavar='JOB_ID', help=(
                'list pages for all sites of the supplied job'))
    group.add_argument(
            '--site', dest='site', metavar='SITE', help=(
                'list pages of the supplied site'))
    group = arg_parser.add_mutually_exclusive_group()
    group.add_argument(
            '--queued', dest='queued', action='store_true', help=(
                'limit only queued pages'))
    group.add_argument(
            '--brozzled', dest='brozzled', action='store_true', help=(
                'limit only pages that have already been brozzled'))
    group.add_argument(
            '--claimed', dest='claimed', action='store_true', help=(
                'limit only pages that are currently claimed by a brozzler '
                'worker'))
    _add_rethinkdb_options(arg_parser)
    _add_common_options(arg_parser)

    args = arg_parser.parse_args(args=sys.argv[1:])
    _configure_logging(args)

    r = rethinkstuff.Rethinker(
            args.rethinkdb_servers.split(','), args.rethinkdb_db)

    # Test against None rather than truthiness: `--job ''` must stay on the
    # job branch instead of falling through to --site, where args.site is
    # None and int(None) would raise an uncaught TypeError.
    if args.job is not None:
        job_id = parse_id(args.job)
        reql = r.table('sites').get_all(job_id, index='job_id')['id']
        logging.debug('querying rethinkdb: %s', reql)
        site_ids = reql.run()
    else:
        site_ids = [parse_id(args.site)]

    # Bounds on the second component of the "least_hops" index key
    # (presumably the page's brozzle count -- TODO confirm against the index
    # definition): --queued selects keys where it is 0, --brozzled keys where
    # it is 1 or more, and otherwise the whole range is scanned.
    lower = 1 if args.brozzled else 0
    upper = 0 if args.queued else r.maxval

    for site_id in site_ids:
        reql = r.table('pages').between(
                [site_id, lower, r.minval], [site_id, upper, r.maxval],
                index='least_hops')
        reql = reql.order_by(index="least_hops")
        if args.claimed:
            # --claimed cannot be combined with --queued/--brozzled, so this
            # filter always applies to the full range of the site's pages
            reql = reql.filter({'claimed': True})
        logging.debug('querying rethinkdb: %s', reql)
        results = reql.run()
        for result in results:
            print(json.dumps(result, cls=Jsonner, indent=2))
|
||||
|
||||
def brozzler_list_captures():
|
||||
'''
|
||||
Handy utility for looking up entries in the rethinkdb "captures" table by
|
||||
@ -381,12 +519,6 @@ def brozzler_list_captures():
|
||||
r = rethinkstuff.Rethinker(
|
||||
args.rethinkdb_servers.split(','), args.rethinkdb_db)
|
||||
|
||||
class Jsonner(json.JSONEncoder):
|
||||
def default(self, o):
|
||||
if isinstance(o, datetime.datetime):
|
||||
return o.isoformat()
|
||||
return json.JSONEncoder.default(self, o)
|
||||
|
||||
if args.url_or_sha1[:5] == 'sha1:':
|
||||
if args.prefix:
|
||||
logging.warn(
|
||||
@ -398,7 +530,7 @@ def brozzler_list_captures():
|
||||
[sha1base32, rethinkdb.minval, rethinkdb.minval],
|
||||
[sha1base32, rethinkdb.maxval, rethinkdb.maxval],
|
||||
index='sha1_warc_type')
|
||||
logging.debug('rethinkdb query: %s', reql)
|
||||
logging.debug('querying rethinkdb: %s', reql)
|
||||
results = reql.run()
|
||||
for result in results:
|
||||
print(json.dumps(result, cls=Jsonner, indent=2))
|
||||
@ -422,7 +554,7 @@ def brozzler_list_captures():
|
||||
reql = reql.filter(
|
||||
lambda capture: (capture['canon_surt'] >= key)
|
||||
& (capture['canon_surt'] <= end_key))
|
||||
logging.debug('rethinkdb query: %s', reql)
|
||||
logging.debug('querying rethinkdb: %s', reql)
|
||||
results = reql.run()
|
||||
for result in results:
|
||||
print(json.dumps(result, cls=Jsonner, indent=2))
|
||||
|
@ -22,6 +22,13 @@ var umbraBehavior = {
|
||||
idleSince : null,
|
||||
alreadyClicked : {},
|
||||
|
||||
// https://github.com/jquery/jquery/blob/master/src/css/hiddenVisibleSelectors.js
|
||||
// n.b. returns true for elements with visibility:hidden, which occupy
|
||||
// screen real estate but are not visible, or clickable with the ui
|
||||
isVisible : function(elem) {
|
||||
return !!(elem.offsetWidth || elem.offsetHeight || elem.getClientRects().length);
|
||||
},
|
||||
|
||||
intervalFunc : function() {
|
||||
var clickedSomething = false;
|
||||
var somethingLeftBelow = false;
|
||||
@ -38,10 +45,11 @@ var umbraBehavior = {
|
||||
}
|
||||
|
||||
for (var j = 0; j < documents.length; j++) {
|
||||
|
||||
var clickTargets = documents[j].querySelectorAll(cssSelector);
|
||||
|
||||
for ( var i = 0; i < clickTargets.length; i++) {
|
||||
if (!this.isVisible(clickTargets[i])) {
|
||||
continue;
|
||||
}
|
||||
if (clickTargets[i].umbraClicked && !clickUntilTimeout) {
|
||||
continue;
|
||||
}
|
||||
@ -78,12 +86,12 @@ var umbraBehavior = {
|
||||
this.idleSince = null;
|
||||
} else if (somethingLeftBelow) {
|
||||
// console.log("scrolling because everything on this screen has been clicked but there's more below document.body.clientHeight="
|
||||
// + document.body.clientHeight);
|
||||
// + document.body.clientHeight);
|
||||
window.scrollBy(0, 200);
|
||||
this.idleSince = null;
|
||||
} else if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) {
|
||||
// console.log("scrolling because we're not to the bottom yet document.body.clientHeight="
|
||||
// + document.body.clientHeight);
|
||||
// + document.body.clientHeight);
|
||||
window.scrollBy(0, 200);
|
||||
this.idleSince = null;
|
||||
} else if (this.idleSince == null) {
|
||||
|
5
setup.py
5
setup.py
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||
|
||||
setuptools.setup(
|
||||
name='brozzler',
|
||||
version='1.1b9.dev162',
|
||||
version='1.1b9.dev165',
|
||||
description='Distributed web crawling with browsers',
|
||||
url='https://github.com/internetarchive/brozzler',
|
||||
author='Noah Levitt',
|
||||
@ -53,6 +53,9 @@ setuptools.setup(
|
||||
'brozzler-worker=brozzler.cli:brozzler_worker',
|
||||
'brozzler-ensure-tables=brozzler.cli:brozzler_ensure_tables',
|
||||
'brozzler-list-captures=brozzler.cli:brozzler_list_captures',
|
||||
'brozzler-list-jobs=brozzler.cli:brozzler_list_jobs',
|
||||
'brozzler-list-sites=brozzler.cli:brozzler_list_sites',
|
||||
'brozzler-list-pages=brozzler.cli:brozzler_list_pages',
|
||||
'brozzler-dashboard=brozzler.dashboard:main',
|
||||
'brozzler-easy=brozzler.easy:main',
|
||||
'brozzler-wayback=brozzler.pywb:main',
|
||||
|
Loading…
x
Reference in New Issue
Block a user