Merge branch 'qa' of github.com:internetarchive/brozzler into qa

Barbara Miller 2017-01-30 20:52:58 -08:00
commit 6bf8cfe893
16 changed files with 297 additions and 184 deletions

View File

@ -17,3 +17,5 @@ after_failure:
notifications:
slack:
secure: KPPXSscXnmSEQ2NXBZFKrzDEYHg067Kv1WR7RTRUH8EIlSS9MHTyErRa7HkaRPmqOllj4vvPbplNU2ALnCfhP4cqW+MvF0xv3GuEGXQ7Om2sBvVUQ3w0JJ5rLq9ferAfGdSnQFeViqfDix5LA3fMNZGouUHQdUHq7iO8E9n9jntvkKO9Jff7Dyo0K5KvOZOJfM9KsqFZLlFO5zoNB6Y9jubIT7+Ulk3EDto/Kny34VPIyJIm7y0cHHlYLEq780AweY0EIwMyMg/VPSRrVAsbLSrilO0YRgsQpjPC9Ci/rAWNWooaOk0eA+bwv1uHQnGtH0z446XUMXr3UZ2QlD4DE/uoP2okkl8EtqvlmEyjV8eO86TqYFDRgKfYpvlK6hHtb7SAHX28QeXQjbKNc5f7KpKO5PtZqaoBRL7acLlKyS8xQGiRtonTPFSBTFR2A+s6dZmKO9dDboglptiHk4dvL1ZD4S8qLJn1JjTJqvIU6tpCY3BpNErn4n1MkDjN5nqdXf7Q9Vmui8vRetwnMf1oXcsKj9FEt2utNfDqFNXcFsN+Mnr9rhXQ1++gt/7Zo844OowiARcxqZTNy5LqSD01WgGCvNMy3Odf+FTQ8PcDOF+001+g8La1R99U0o9/hT/gy+WYk2prYneWru4pQHF/a6goZgkLTwkskcaPVpDJtDs=
secure: jopAXO8j3AkNWhF02GIzlkHJmqcCfrDEDPHcLHwxGB1vKrJqfMtcmV1+JXv7jGPwT8hBkkZItD1fTbsA1UMTtZCsadhqwrH9sh/BtJy4mf1jDDK0Hq4bPdbpB/mHKBfjD+ZedPZphCiwRQm94QdMOAsmCsj1BluFn+ySHuNAnwyXCNohut5a3aFBszOwBNgZMwBmu+weAUpMrDbr/dhqOtU0IaNvhTJ2Ykyex7Of86L05lBI8MiGtq/J73uDiDINWViBXqG5+/LKIVLvnjzCxZOnOVtSVorRNY0OsClfLJILuWOXk0/C3p+lBCyq5iatWweNqcqqpMifUSdVp4x8GnPyvl4O5YuIZW674mpGmH6UW10MqEnqxFQIcZpArir/zToK/cIKsUse20n8U5LUgOSWeNM1RIBvc4ckeDuthjwvyfmP0hrnNxrPFxRez2J2r6alWFABvD0H83a3hn56AtGXqV+9gt9d4J0+vnBJkXMidQaORBnyRkPlTROxqkoK8r0PME8xr6GwDWHpUN7/Ibo9gS/zpA7zpJUIsAsevVKOSaITZwKqbCMTI3uy/tJcnzRUrnq5wqhh8vXlWzIxEvTW8vuIapjSvDzhnJga85bIEmoauyMd13gR/vhqXQ3xUdN5LeyXAPn24b5e2GNSrhDOaAs30tXe+Z31njSeKPM=

View File

@ -157,7 +157,10 @@ class WebsockReceiverThread(threading.Thread):
brozzler.thread_raise(self.calling_thread, BrowsingException)
def run(self):
self.websock.run_forever()
# ping_timeout is used as the timeout for the call to select.select()
# in addition to its documented purpose, and must have a value to avoid
# hangs in certain situations
self.websock.run_forever(ping_timeout=0.5)
def _on_message(self, websock, message):
try:
@ -202,6 +205,17 @@ class WebsockReceiverThread(threading.Thread):
if self.on_response:
self.on_response(message)
def _javascript_dialog_opening(self, message):
self.logger.info('javascript dialog opened: %s', message)
if message['params']['type'] == 'alert':
accept = True
else:
accept = False
self.websock.send(
json.dumps(dict(
id=0, method='Page.handleJavaScriptDialog',
params={'accept': accept})))
def _handle_message(self, websock, json_message):
message = json.loads(json_message)
if 'method' in message:
@ -223,6 +237,8 @@ class WebsockReceiverThread(threading.Thread):
'%s console.%s %s', self.websock.url,
message['params']['message']['level'],
message['params']['message']['text'])
elif message['method'] == 'Page.javascriptDialogOpening':
self._javascript_dialog_opening(message)
# else:
# self.logger.debug("%s %s", message["method"], json_message)
elif 'result' in message:
@ -540,6 +556,7 @@ class Browser:
timeout=5)
msg = self.websock_thread.pop_result(msg_id)
if (msg and 'result' in msg
and not ('exceptionDetails' in msg['result'])
and not ('wasThrown' in msg['result']
and msg['result']['wasThrown'])
and 'result' in msg['result']
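
The dialog-handling change above wires Chrome's Page.javascriptDialogOpening event to Page.handleJavaScriptDialog so a crawl can no longer hang on a modal. A minimal standalone sketch of the same idea, assuming a connected DevTools websocket `ws` with the Page domain enabled (not brozzler's actual API):

import json

def handle_dialog_opening(ws, message):
    # accept alert() dialogs (their only button is "OK"); dismiss
    # confirm()/prompt() so the page never blocks waiting for input
    accept = message['params']['type'] == 'alert'
    ws.send(json.dumps({
        'id': 0, 'method': 'Page.handleJavaScriptDialog',
        'params': {'accept': accept}}))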

View File

@ -177,6 +177,7 @@ class Chrome:
json_url = 'http://localhost:%s/json' % self.port
# make this a member variable so that kill -QUIT reports it
self._start = time.time()
self._last_warning = self._start
while True:
try:
raw_json = urllib.request.urlopen(json_url, timeout=30).read()
@ -194,11 +195,11 @@ class Chrome:
except brozzler.ShutdownRequested:
raise
except BaseException as e:
if int(time.time() - self._start) % 10 == 5:
if time.time() - self._last_warning > 30:
self.logger.warn(
'problem with %s (will keep trying until timeout '
'of %d seconds): %s', json_url, timeout_sec, e)
pass
self._last_warning = time.time()
finally:
if time.time() - self._start > timeout_sec:
self.logger.error(
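
The change above replaces the old modulo-based heuristic with a timestamp, warning at most once every 30 seconds while waiting for Chrome's debug endpoint to come up. A minimal sketch of the same throttled-warning polling pattern in isolation (`poll_until_ready` and `check` are illustrative names, not brozzler code):

import logging
import time

def poll_until_ready(check, timeout_sec=600):
    start = last_warning = time.time()
    while time.time() - start < timeout_sec:
        try:
            return check()
        except Exception as e:
            # warn at most once every 30 seconds instead of flooding the log
            if time.time() - last_warning > 30:
                logging.warning(
                    'problem (will keep trying until timeout of %s sec): %s',
                    timeout_sec, e)
                last_warning = time.time()
        time.sleep(0.5)
    raise TimeoutError('not ready after %s seconds' % timeout_sec)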

View File

@ -2,7 +2,7 @@
'''
brozzler/cli.py - brozzler command line executables
Copyright (C) 2014-2016 Internet Archive
Copyright (C) 2014-2017 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@ -38,16 +38,19 @@ import yaml
import shutil
import base64
def _add_common_options(arg_parser):
def add_common_options(arg_parser):
arg_parser.add_argument(
'-q', '--quiet', dest='log_level',
action='store_const', default=logging.INFO, const=logging.WARN)
'-q', '--quiet', dest='log_level', action='store_const',
default=logging.INFO, const=logging.WARN, help=(
'quiet logging, only warnings and errors'))
arg_parser.add_argument(
'-v', '--verbose', dest='log_level',
action='store_const', default=logging.INFO, const=logging.DEBUG)
'-v', '--verbose', dest='log_level', action='store_const',
default=logging.INFO, const=logging.DEBUG, help=(
'verbose logging'))
arg_parser.add_argument(
'--trace', dest='log_level',
action='store_const', default=logging.INFO, const=brozzler.TRACE)
'--trace', dest='log_level', action='store_const',
default=logging.INFO, const=brozzler.TRACE, help=(
'very verbose logging'))
# arg_parser.add_argument(
# '-s', '--silent', dest='log_level', action='store_const',
# default=logging.INFO, const=logging.CRITICAL)
@ -56,15 +59,26 @@ def _add_common_options(arg_parser):
version='brozzler %s - %s' % (
brozzler.__version__, os.path.basename(sys.argv[0])))
def _add_rethinkdb_options(arg_parser):
def add_rethinkdb_options(arg_parser):
arg_parser.add_argument(
'--rethinkdb-servers', dest='rethinkdb_servers',
default='localhost', help=(
default=os.environ.get('BROZZLER_RETHINKDB_SERVERS', 'localhost'),
help=(
'rethinkdb servers, e.g. '
'db0.foo.org,db0.foo.org:38015,db1.foo.org'))
'db0.foo.org,db0.foo.org:38015,db1.foo.org (default is the '
'value of environment variable BROZZLER_RETHINKDB_SERVERS)'))
arg_parser.add_argument(
'--rethinkdb-db', dest='rethinkdb_db', default='brozzler',
help='rethinkdb database name')
'--rethinkdb-db', dest='rethinkdb_db',
default=os.environ.get('BROZZLER_RETHINKDB_DB', 'brozzler'),
help=(
'rethinkdb database name (default is the value of environment '
'variable BROZZLER_RETHINKDB_DB)'))
def rethinker(args):
servers = args.rethinkdb_servers or 'localhost'
db = args.rethinkdb_db or os.environ.get(
'BROZZLER_RETHINKDB_DB') or 'brozzler'
return rethinkstuff.Rethinker(servers.split(','), db)
def _add_proxy_options(arg_parser):
arg_parser.add_argument(
@ -75,7 +89,7 @@ def _add_proxy_options(arg_parser):
'enable special features that assume the configured proxy is '
'warcprox'))
def _configure_logging(args):
def configure_logging(args):
logging.basicConfig(
stream=sys.stderr, level=args.log_level,
format=(
@ -107,6 +121,18 @@ def suggest_default_chrome_exe():
return exe
return 'chromium-browser'
class BetterArgumentDefaultsHelpFormatter(
argparse.ArgumentDefaultsHelpFormatter):
'''
Like argparse.ArgumentDefaultsHelpFormatter but omits the default value
for arguments with action='store_const'.
'''
def _get_help_string(self, action):
if isinstance(action, argparse._StoreConstAction):
return action.help
else:
return super()._get_help_string(action)
def brozzle_page():
'''
Command line utility entry point for brozzling a single page. Opens url in
@ -115,7 +141,7 @@ def brozzle_page():
arg_parser = argparse.ArgumentParser(
prog=os.path.basename(sys.argv[0]),
description='brozzle-page - brozzle a single page',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
formatter_class=BetterArgumentDefaultsHelpFormatter)
arg_parser.add_argument('url', metavar='URL', help='page url')
arg_parser.add_argument(
'-e', '--chrome-exe', dest='chrome_exe',
@ -141,10 +167,10 @@ def brozzle_page():
action='store_true', help=(
'enable special features that assume the configured proxy '
'is warcprox'))
_add_common_options(arg_parser)
add_common_options(arg_parser)
args = arg_parser.parse_args(args=sys.argv[1:])
_configure_logging(args)
configure_logging(args)
behavior_parameters = {}
if args.behavior_parameters:
@ -191,14 +217,13 @@ def brozzler_new_job():
arg_parser.add_argument(
'job_conf_file', metavar='JOB_CONF_FILE',
help='brozzler job configuration file in yaml')
_add_rethinkdb_options(arg_parser)
_add_common_options(arg_parser)
add_rethinkdb_options(arg_parser)
add_common_options(arg_parser)
args = arg_parser.parse_args(args=sys.argv[1:])
_configure_logging(args)
configure_logging(args)
r = rethinkstuff.Rethinker(
args.rethinkdb_servers.split(','), args.rethinkdb_db)
r = rethinker(args)
frontier = brozzler.RethinkDbFrontier(r)
try:
brozzler.job.new_job_file(frontier, args.job_conf_file)
@ -218,7 +243,7 @@ def brozzler_new_site():
description='brozzler-new-site - register site to brozzle',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
arg_parser.add_argument('seed', metavar='SEED', help='seed url')
_add_rethinkdb_options(arg_parser)
add_rethinkdb_options(arg_parser)
_add_proxy_options(arg_parser)
arg_parser.add_argument(
'--time-limit', dest='time_limit', default=None,
@ -244,10 +269,10 @@ def brozzler_new_site():
arg_parser.add_argument(
'--password', dest='password', default=None,
help='use this password to try to log in if a login form is found')
_add_common_options(arg_parser)
add_common_options(arg_parser)
args = arg_parser.parse_args(args=sys.argv[1:])
_configure_logging(args)
configure_logging(args)
site = brozzler.Site(
seed=args.seed, proxy=args.proxy,
@ -260,8 +285,7 @@ def brozzler_new_site():
args.behavior_parameters) if args.behavior_parameters else None,
username=args.username, password=args.password)
r = rethinkstuff.Rethinker(
args.rethinkdb_servers.split(","), args.rethinkdb_db)
r = rethinker(args)
frontier = brozzler.RethinkDbFrontier(r)
brozzler.new_site(frontier, site)
@ -273,7 +297,7 @@ def brozzler_worker():
arg_parser = argparse.ArgumentParser(
prog=os.path.basename(__file__),
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
_add_rethinkdb_options(arg_parser)
add_rethinkdb_options(arg_parser)
arg_parser.add_argument(
'-e', '--chrome-exe', dest='chrome_exe',
default=suggest_default_chrome_exe(),
@ -281,10 +305,10 @@ def brozzler_worker():
arg_parser.add_argument(
'-n', '--max-browsers', dest='max_browsers', default='1',
help='max number of chrome instances simultaneously browsing pages')
_add_common_options(arg_parser)
add_common_options(arg_parser)
args = arg_parser.parse_args(args=sys.argv[1:])
_configure_logging(args)
configure_logging(args)
def sigterm(signum, frame):
raise brozzler.ShutdownRequested('shutdown requested (caught SIGTERM)')
@ -316,8 +340,7 @@ def brozzler_worker():
signal.signal(signal.SIGTERM, sigterm)
signal.signal(signal.SIGINT, sigint)
r = rethinkstuff.Rethinker(
args.rethinkdb_servers.split(','), args.rethinkdb_db)
r = rethinker(args)
frontier = brozzler.RethinkDbFrontier(r)
service_registry = rethinkstuff.ServiceRegistry(r)
worker = brozzler.worker.BrozzlerWorker(
@ -339,14 +362,13 @@ def brozzler_ensure_tables():
arg_parser = argparse.ArgumentParser(
prog=os.path.basename(sys.argv[0]),
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
_add_rethinkdb_options(arg_parser)
_add_common_options(arg_parser)
add_rethinkdb_options(arg_parser)
add_common_options(arg_parser)
args = arg_parser.parse_args(args=sys.argv[1:])
_configure_logging(args)
configure_logging(args)
r = rethinkstuff.Rethinker(
args.rethinkdb_servers.split(','), args.rethinkdb_db)
r = rethinker(args)
# services table
rethinkstuff.ServiceRegistry(r)
@ -370,14 +392,13 @@ def brozzler_list_jobs():
arg_parser.add_argument(
'-a', '--all', dest='all', action='store_true', help=(
'list all jobs (by default, only active jobs are listed)'))
_add_rethinkdb_options(arg_parser)
_add_common_options(arg_parser)
add_rethinkdb_options(arg_parser)
add_common_options(arg_parser)
args = arg_parser.parse_args(args=sys.argv[1:])
_configure_logging(args)
configure_logging(args)
r = rethinkstuff.Rethinker(
args.rethinkdb_servers.split(','), args.rethinkdb_db)
r = rethinker(args)
reql = r.table('jobs').order_by('id')
if not args.all:
reql = reql.filter({'status': 'ACTIVE'})
@ -400,14 +421,13 @@ def brozzler_list_sites():
group.add_argument(
'--job', dest='job', metavar='JOB_ID', help=(
'list only sites for the supplied job'))
_add_rethinkdb_options(arg_parser)
_add_common_options(arg_parser)
add_rethinkdb_options(arg_parser)
add_common_options(arg_parser)
args = arg_parser.parse_args(args=sys.argv[1:])
_configure_logging(args)
configure_logging(args)
r = rethinkstuff.Rethinker(
args.rethinkdb_servers.split(','), args.rethinkdb_db)
r = rethinker(args)
reql = r.table('sites')
if args.job:
@ -447,14 +467,13 @@ def brozzler_list_pages():
'--claimed', dest='claimed', action='store_true', help=(
'limit only pages that are currently claimed by a brozzler '
'worker'))
_add_rethinkdb_options(arg_parser)
_add_common_options(arg_parser)
add_rethinkdb_options(arg_parser)
add_common_options(arg_parser)
args = arg_parser.parse_args(args=sys.argv[1:])
_configure_logging(args)
configure_logging(args)
r = rethinkstuff.Rethinker(
args.rethinkdb_servers.split(','), args.rethinkdb_db)
r = rethinker(args)
if args.job:
try:
job_id = int(args.job)
@ -507,17 +526,16 @@ def brozzler_list_captures():
'use prefix match for url (n.b. may not work as expected if '
'searching key has query string because canonicalization can '
'reorder query parameters)'))
_add_rethinkdb_options(arg_parser)
_add_common_options(arg_parser)
add_rethinkdb_options(arg_parser)
add_common_options(arg_parser)
arg_parser.add_argument(
'url_or_sha1', metavar='URL_or_SHA1',
help='url or sha1 to look up in captures table')
args = arg_parser.parse_args(args=sys.argv[1:])
_configure_logging(args)
configure_logging(args)
r = rethinkstuff.Rethinker(
args.rethinkdb_servers.split(','), args.rethinkdb_db)
r = rethinker(args)
if args.url_or_sha1[:5] == 'sha1:':
if args.prefix:
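
With the leading underscores dropped, these helpers become a small public API that other entry points (brozzler-easy, brozzler-dashboard) reuse later in this commit. A minimal usage sketch for an external script, using only the function and option names defined above:

import argparse
import sys

import brozzler.cli

arg_parser = argparse.ArgumentParser(prog='my-brozzler-tool')
# --rethinkdb-servers / --rethinkdb-db, defaulting from the
# BROZZLER_RETHINKDB_SERVERS / BROZZLER_RETHINKDB_DB environment variables
brozzler.cli.add_rethinkdb_options(arg_parser)
# -q / -v / --trace / --version
brozzler.cli.add_common_options(arg_parser)
args = arg_parser.parse_args(sys.argv[1:])
brozzler.cli.configure_logging(args)
r = brozzler.cli.rethinker(args)  # rethinkstuff.Rethinker connection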

View File

@ -2,7 +2,7 @@
brozzler/dashboard/__init__.py - flask app for brozzler dashboard, defines api
endpoints etc
Copyright (C) 2014-2016 Internet Archive
Copyright (C) 2014-2017 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@ -35,27 +35,15 @@ import rethinkdb
import yaml
import base64
# flask does its own logging config
# logging.basicConfig(
# stream=sys.stdout, level=logging.INFO,
# format=(
# "%(asctime)s %(process)d %(levelname)s %(threadName)s "
# "%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s")
app = flask.Flask(__name__)
# http://stackoverflow.com/questions/26578733/why-is-flask-application-not-creating-any-logs-when-hosted-by-gunicorn
gunicorn_error_logger = logging.getLogger('gunicorn.error')
app.logger.handlers.extend(gunicorn_error_logger.handlers)
app.logger.setLevel(logging.INFO)
# configure with environment variables
SETTINGS = {
'RETHINKDB_SERVERS': os.environ.get(
'RETHINKDB_SERVERS', 'localhost').split(','),
'RETHINKDB_DB': os.environ.get('RETHINKDB_DB', 'brozzler'),
'BROZZLER_RETHINKDB_SERVERS', 'localhost').split(','),
'RETHINKDB_DB': os.environ.get('BROZZLER_RETHINKDB_DB', 'brozzler'),
'WAYBACK_BASEURL': os.environ.get(
'WAYBACK_BASEURL', 'http://localhost:8091/brozzler'),
'WAYBACK_BASEURL', 'http://localhost:8880/brozzler'),
}
r = rethinkstuff.Rethinker(
SETTINGS['RETHINKDB_SERVERS'], db=SETTINGS['RETHINKDB_DB'])
@ -69,20 +57,24 @@ def service_registry():
@app.route("/api/sites/<site_id>/queued_count")
@app.route("/api/site/<site_id>/queued_count")
def queued_count(site_id):
count = r.table("pages").between(
reql = r.table("pages").between(
[site_id, 0, False, r.minval], [site_id, 0, False, r.maxval],
index="priority_by_site").count().run()
index="priority_by_site").count()
logging.debug("querying rethinkdb: %s", reql)
count = reql.run()
return flask.jsonify(count=count)
@app.route("/api/sites/<site_id>/queue")
@app.route("/api/site/<site_id>/queue")
def queue(site_id):
app.logger.info("flask.request.args=%s", flask.request.args)
logging.debug("flask.request.args=%s", flask.request.args)
start = flask.request.args.get("start", 0)
end = flask.request.args.get("end", start + 90)
queue_ = r.table("pages").between(
reql = r.table("pages").between(
[site_id, 0, False, r.minval], [site_id, 0, False, r.maxval],
index="priority_by_site")[start:end].run()
index="priority_by_site")[start:end]
logging.debug("querying rethinkdb: %s", reql)
queue_ = reql.run()
return flask.jsonify(queue_=list(queue_))
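
Each endpoint now builds the ReQL expression, logs it at debug level, and only then runs it, so slow or failing queries can be traced from the logs. A sketch of the same pattern factored into a helper (`run_query` is a hypothetical name, not part of the dashboard code):

import logging

def run_query(reql):
    # log the ReQL expression before executing it
    logging.debug('querying rethinkdb: %s', reql)
    return reql.run()

# e.g. count = run_query(
#     r.table('pages').between(
#         [site_id, 0, False, r.minval], [site_id, 0, False, r.maxval],
#         index='priority_by_site').count())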
@app.route("/api/sites/<site_id>/pages_count")
@ -90,42 +82,51 @@ def queue(site_id):
@app.route("/api/sites/<site_id>/page_count")
@app.route("/api/site/<site_id>/page_count")
def page_count(site_id):
count = r.table("pages").between(
reql = r.table("pages").between(
[site_id, 1, False, r.minval],
[site_id, r.maxval, False, r.maxval],
index="priority_by_site").count().run()
index="priority_by_site").count()
logging.debug("querying rethinkdb: %s", reql)
count = reql.run()
return flask.jsonify(count=count)
@app.route("/api/sites/<site_id>/pages")
@app.route("/api/site/<site_id>/pages")
def pages(site_id):
"""Pages already crawled."""
app.logger.info("flask.request.args=%s", flask.request.args)
start = int(flask.request.args.get("start", 0))
end = int(flask.request.args.get("end", start + 90))
pages_ = r.table("pages").between(
reql = r.table("pages").between(
[site_id, 1, r.minval], [site_id, r.maxval, r.maxval],
index="least_hops").order_by(index="least_hops")[start:end].run()
index="least_hops").order_by(index="least_hops")[start:end]
logging.debug("querying rethinkdb: %s", reql)
pages_ = reql.run()
return flask.jsonify(pages=list(pages_))
@app.route("/api/pages/<page_id>")
@app.route("/api/page/<page_id>")
def page(page_id):
page_ = r.table("pages").get(page_id).run()
reql = r.table("pages").get(page_id)
logging.debug("querying rethinkdb: %s", reql)
page_ = reql.run()
return flask.jsonify(page_)
@app.route("/api/pages/<page_id>/yaml")
@app.route("/api/page/<page_id>/yaml")
def page_yaml(page_id):
page_ = r.table("pages").get(page_id).run()
reql = r.table("pages").get(page_id)
logging.debug("querying rethinkdb: %s", reql)
page_ = reql.run()
return app.response_class(
yaml.dump(page_, default_flow_style=False),
mimetype='application/yaml')
mimetype="application/yaml")
@app.route("/api/sites/<site_id>")
@app.route("/api/site/<site_id>")
def site(site_id):
s = r.table("sites").get(site_id).run()
reql = r.table("sites").get(site_id)
logging.debug("querying rethinkdb: %s", reql)
s = reql.run()
if "cookie_db" in s:
s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii")
return flask.jsonify(s)
@ -133,20 +134,30 @@ def site(site_id):
@app.route("/api/sites/<site_id>/yaml")
@app.route("/api/site/<site_id>/yaml")
def site_yaml(site_id):
site_ = r.table("sites").get(site_id).run()
reql = r.table("sites").get(site_id)
logging.debug("querying rethinkdb: %s", reql)
site_ = reql.run()
return app.response_class(
yaml.dump(site_, default_flow_style=False),
mimetype='application/yaml')
mimetype="application/yaml")
@app.route("/api/stats/<bucket>")
def stats(bucket):
stats_ = r.table("stats").get(bucket).run()
reql = r.table("stats").get(bucket)
logging.debug("querying rethinkdb: %s", reql)
stats_ = reql.run()
return flask.jsonify(stats_)
@app.route("/api/jobs/<int:job_id>/sites")
@app.route("/api/job/<int:job_id>/sites")
@app.route("/api/jobs/<job_id>/sites")
@app.route("/api/job/<job_id>/sites")
def sites(job_id):
sites_ = list(r.table("sites").get_all(job_id, index="job_id").run())
try:
jid = int(job_id)
except ValueError:
jid = job_id
reql = r.table("sites").get_all(jid, index="job_id")
logging.debug("querying rethinkdb: %s", reql)
sites_ = list(reql.run())
# TypeError: <binary, 7168 bytes, '53 51 4c 69 74 65...'> is not JSON serializable
for s in sites_:
if "cookie_db" in s:
@ -156,26 +167,40 @@ def sites(job_id):
@app.route("/api/jobless-sites")
def jobless_sites():
# XXX inefficient (unindexed) query
sites_ = list(r.table("sites").filter(~r.row.has_fields("job_id")).run())
reql = r.table("sites").filter(~r.row.has_fields("job_id"))
logging.debug("querying rethinkdb: %s", reql)
sites_ = list(reql.run())
# TypeError: <binary, 7168 bytes, '53 51 4c 69 74 65...'> is not JSON serializable
for s in sites_:
if "cookie_db" in s:
s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii")
return flask.jsonify(sites=sites_)
@app.route("/api/jobs/<int:job_id>")
@app.route("/api/job/<int:job_id>")
@app.route("/api/jobs/<job_id>")
@app.route("/api/job/<job_id>")
def job(job_id):
job_ = r.table("jobs").get(job_id).run()
try:
jid = int(job_id)
except ValueError:
jid = job_id
reql = r.table("jobs").get(jid)
logging.debug("querying rethinkdb: %s", reql)
job_ = reql.run()
return flask.jsonify(job_)
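
The same int-or-string coercion appears in several of the job and site routes, because ids are usually integers but may be arbitrary strings. A hypothetical helper (not part of this commit) expressing it once:

def coerce_job_id(job_id):
    try:
        return int(job_id)
    except ValueError:
        return job_id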
@app.route("/api/jobs/<int:job_id>/yaml")
@app.route("/api/job/<int:job_id>/yaml")
@app.route("/api/jobs/<job_id>/yaml")
@app.route("/api/job/<job_id>/yaml")
def job_yaml(job_id):
job_ = r.table("jobs").get(job_id).run()
try:
jid = int(job_id)
except ValueError:
jid = job_id
reql = r.table("jobs").get(jid)
logging.debug("querying rethinkdb: %s", reql)
job_ = reql.run()
return app.response_class(
yaml.dump(job_, default_flow_style=False),
mimetype='application/yaml')
mimetype="application/yaml")
@app.route("/api/workers")
def workers():
@ -189,7 +214,9 @@ def services():
@app.route("/api/jobs")
def jobs():
jobs_ = list(r.table("jobs").order_by(rethinkdb.desc("id")).run())
reql = r.table("jobs").order_by(rethinkdb.desc("id"))
logging.debug("querying rethinkdb: %s", reql)
jobs_ = list(reql.run())
return flask.jsonify(jobs=jobs_)
@app.route("/api/config")
@ -209,6 +236,12 @@ def root(path):
try:
import gunicorn.app.base
from gunicorn.six import iteritems
import gunicorn.glogging
class BypassGunicornLogging(gunicorn.glogging.Logger):
def setup(self, cfg):
self.error_log.handlers = logging.root.handlers
self.access_log.handlers = logging.root.handlers
class GunicornBrozzlerDashboard(gunicorn.app.base.BaseApplication):
def __init__(self, app, options=None):
@ -222,21 +255,24 @@ try:
if key in self.cfg.settings and value is not None])
for key, value in iteritems(config):
self.cfg.set(key.lower(), value)
self.cfg.set("logger_class", BypassGunicornLogging)
self.cfg.set("accesslog", "dummy-value")
def load(self):
return self.application
def run(**options):
logging.info('running brozzler-dashboard using gunicorn')
logging.info("running brozzler-dashboard using gunicorn")
GunicornBrozzlerDashboard(app, options).run()
except ImportError:
def run():
logging.info('running brozzler-dashboard using simple flask app.run')
logging.info("running brozzler-dashboard using simple flask app.run")
app.run()
def main():
import argparse
import brozzler.cli
arg_parser = argparse.ArgumentParser(
prog=os.path.basename(sys.argv[0]),
formatter_class=argparse.RawDescriptionHelpFormatter,
@ -246,13 +282,16 @@ def main():
epilog=(
'brozzler-dashboard has no command line options, but can be '
'configured using the following environment variables:\n\n'
' RETHINKDB_SERVERS rethinkdb servers, e.g. db0.foo.org,'
'db0.foo.org:38015,db1.foo.org (default: localhost)\n'
' RETHINKDB_DB rethinkdb database name (default: '
'brozzler)\n'
' BROZZLER_RETHINKDB_SERVERS rethinkdb servers, e.g. '
'db0.foo.org,db0.foo.org:38015,db1.foo.org (default: '
'localhost)\n'
' BROZZLER_RETHINKDB_DB rethinkdb database name '
'(default: brozzler)\n'
' WAYBACK_BASEURL base url for constructing wayback '
'links (default http://localhost:8091/brozzler)'))
'links (default http://localhost:8880/brozzler)'))
brozzler.cli.add_common_options(arg_parser)
args = arg_parser.parse_args(args=sys.argv[1:])
brozzler.cli.configure_logging(args)
run()
if __name__ == "__main__":

View File

@ -1,7 +1,7 @@
/*
* brozzler/dashboard/static/js/app.js - brozzler dashboard angularjs code
*
* Copyright (C) 2014-2016 Internet Archive
* Copyright (C) 2014-2017 Internet Archive
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -96,16 +96,12 @@ brozzlerControllers.controller("WorkersListController", ["$scope", "$http",
function statsSuccessCallback(site, bucket) {
return function(data) {
// console.log("site = ", site);
// console.log("/api/stats/" + bucket + " = ", data);
site.stats = data;
}
}
function pageCountSuccessCallback(site, job) {
return function(data) {
// console.log("site = ", site);
// console.log("/api/sites/" + site.id + "/page_count = ", data);
site.page_count = data.count;
if (job) {
job.page_count += data.count;
@ -115,8 +111,6 @@ function pageCountSuccessCallback(site, job) {
function queuedCountSuccessCallback(site, job) {
return function(data) {
// console.log("site = ", site);
// console.log("/api/sites/" + site.id + "/queued_count = ", data);
site.queued_count = data.count;
if (job) {
job.queued_count += data.count;
@ -129,41 +123,44 @@ function loadSiteStats($http, site, job) {
$http.get("/api/sites/" + site.id + "/queued_count").success(queuedCountSuccessCallback(site, job));
// look at Warcprox-Meta to find stats bucket
for (var j = 0; j < site.warcprox_meta.stats.buckets.length; j++) {
var bucket = site.warcprox_meta.stats.buckets[j];
if (typeof(bucket) == "object") {
bucket = bucket["bucket"];
}
if (bucket.indexOf("seed") >= 0) {
// console.log("warcprox_meta.stats.buckets[" + j + "]=" + bucket);
$http.get("/api/stats/" + bucket).success(statsSuccessCallback(site, bucket));
try {
for (var j = 0; j < site.warcprox_meta.stats.buckets.length; j++) {
var bucket = site.warcprox_meta.stats.buckets[j];
if (typeof(bucket) == "object") {
bucket = bucket["bucket"];
}
if (bucket.indexOf("seed") >= 0) {
$http.get("/api/stats/" + bucket).success(statsSuccessCallback(site, bucket));
}
}
} catch (e) {
// no stats bucket for this site
}
}
brozzlerControllers.controller("JobController", ["$scope", "$routeParams", "$http",
function($scope, $routeParams, $http) {
$scope.show_yaml = false;
// console.log('JobController');
$http.get("/api/config").success(function(data) {
$scope.config = data.config;
});
$http.get("/api/jobs/" + $routeParams.id).success(function(data) {
$scope.job = data;
$scope.job.page_count = $scope.job.queued_count = 0;
// console.log("job=", $scope.job);
var bucket = $scope.job.conf.warcprox_meta.stats.buckets[0];
if (typeof(bucket) == "object") {
bucket = bucket["bucket"];
try {
var bucket = $scope.job.conf.warcprox_meta.stats.buckets[0];
if (typeof(bucket) == "object") {
bucket = bucket["bucket"];
}
$http.get("/api/stats/" + bucket).success(function(data) {
$scope.job.stats = data;
});
} catch (e) {
// no stats bucket for this job
}
$http.get("/api/stats/" + bucket).success(function(data) {
$scope.job.stats = data;
// console.log("job stats=", $scope.job.stats);
});
$http.get("/api/jobs/" + $routeParams.id + "/sites").success(function(data) {
$scope.sites = data.sites;
// console.log("sites=", $scope.sites);
for (var i = 0; i < $scope.sites.length; i++) {
loadSiteStats($http, $scope.sites[i], $scope.job);
}
@ -180,7 +177,6 @@ brozzlerControllers.controller("SiteController", ["$scope", "$routeParams", "$ht
$scope.loading = false;
$scope.pages = [];
$window.addEventListener("scroll", function() {
// console.log("window.scrollTop=" + window.scrollTop + " window.offsetHeight=" + window.offsetHeight + " window.scrollHeight=" + window.scrollHeight);
if ($window.innerHeight + $window.scrollY + 50 >= window.document.documentElement.scrollHeight) {
loadMorePages();
}
@ -191,10 +187,8 @@ brozzlerControllers.controller("SiteController", ["$scope", "$routeParams", "$ht
return;
$scope.loading = true;
// console.log("load more! start=" + start);
$http.get("/api/site/" + $routeParams.id + "/pages?start=" + start + "&end=" + (start+90)).then(function(response) {
$scope.pages = $scope.pages.concat(response.data.pages);
// console.log("pages = ", $scope.pages);
start += response.data.pages.length;
$scope.loading = false;
}, function(reason) {
@ -209,7 +203,6 @@ brozzlerControllers.controller("SiteController", ["$scope", "$routeParams", "$ht
$http.get("/api/site/" + $routeParams.id).success(function(data) {
$scope.site = data;
loadSiteStats($http, $scope.site);
// console.log("site = ", $scope.site);
});
$http.get("/api/site/" + $routeParams.id + "/yaml").success(function(data) {
$scope.site_yaml = data;

View File

@ -48,21 +48,14 @@ import socketserver
def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
arg_parser = argparse.ArgumentParser(
prog=prog, formatter_class=argparse.ArgumentDefaultsHelpFormatter,
description=(
formatter_class=brozzler.cli.BetterArgumentDefaultsHelpFormatter,
prog=prog, description=(
'brozzler-easy - easy deployment of brozzler, with '
'brozzler-worker, warcprox, pywb, and brozzler-dashboard all '
'running in a single process'))
# common args
arg_parser.add_argument(
'--rethinkdb-servers', dest='rethinkdb_servers',
default='localhost', help=(
'rethinkdb servers, e.g. '
'db0.foo.org,db0.foo.org:38015,db1.foo.org'))
arg_parser.add_argument(
'--rethinkdb-db', dest='rethinkdb_db', default='brozzler',
help='rethinkdb database name')
brozzler.cli.add_rethinkdb_options(arg_parser)
arg_parser.add_argument(
'-d', '--warcs-dir', dest='warcs_dir', default='./warcs',
help='where to write warcs')
@ -114,18 +107,7 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
type=int, default=8881, help='brozzler dashboard port')
# common at the bottom args
arg_parser.add_argument(
'-v', '--verbose', dest='verbose', action='store_true',
help='verbose logging')
arg_parser.add_argument(
'-q', '--quiet', dest='quiet', action='store_true',
help='quiet logging (warnings and errors only)')
# arg_parser.add_argument(
# '-s', '--silent', dest='log_level', action='store_const',
# default=logging.INFO, const=logging.CRITICAL)
arg_parser.add_argument(
'--version', action='version',
version='brozzler %s - %s' % (brozzler.__version__, prog))
brozzler.cli.add_common_options(arg_parser)
return arg_parser
@ -284,17 +266,7 @@ class BrozzlerEasyController:
def main():
arg_parser = _build_arg_parser()
args = arg_parser.parse_args(args=sys.argv[1:])
if args.verbose:
loglevel = logging.DEBUG
elif args.quiet:
loglevel = logging.WARNING
else:
loglevel = logging.INFO
logging.basicConfig(
level=loglevel, stream=sys.stderr, format=(
'%(asctime)s %(process)d %(levelname)s %(threadName)s '
'%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s'))
brozzler.cli.configure_logging(args)
controller = BrozzlerEasyController(args)
signal.signal(signal.SIGTERM, lambda a,b: controller.stop.set())

View File

@ -1,7 +1,7 @@
'''
brozzler/site.py - classes representing sites and pages
Copyright (C) 2014-2016 Internet Archive
Copyright (C) 2014-2017 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@ -235,7 +235,7 @@ class Page(brozzler.BaseDictable):
self, url, id=None, site_id=None, job_id=None, hops_from_seed=0,
redirect_url=None, priority=None, claimed=False, brozzle_count=0,
via_page_id=None, last_claimed_by=None, hops_off_surt=0,
outlinks=None, needs_robots_check=False):
outlinks=None, needs_robots_check=False, blocked_by_robots=None):
self.site_id = site_id
self.job_id = job_id
self.url = url
@ -248,6 +248,7 @@ class Page(brozzler.BaseDictable):
self.hops_off_surt = hops_off_surt
self.outlinks = outlinks
self.needs_robots_check = needs_robots_check
self.blocked_by_robots = blocked_by_robots
self._canon_hurl = None
if priority is not None:

View File

@ -3,7 +3,7 @@ brozzler/worker.py - BrozzlerWorker brozzles pages from the frontier, meaning
it runs youtube-dl on them, browses them and runs behaviors if appropriate,
scopes and adds outlinks to the frontier
Copyright (C) 2014-2016 Internet Archive
Copyright (C) 2014-2017 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@ -337,6 +337,7 @@ class BrozzlerWorker:
if (page.needs_robots_check and
not brozzler.is_permitted_by_robots(site, page.url)):
logging.warn("page %s is blocked by robots.txt", page.url)
page.blocked_by_robots = True
else:
outlinks = self.brozzle_page(browser, site, page)
self._frontier.scope_and_schedule_outlinks(
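
With the new Page.blocked_by_robots field populated by the worker, pages skipped because of robots.txt become visible in the database. A minimal sketch of querying them, assuming a rethinkstuff.Rethinker connection `r` as elsewhere in this commit (an unindexed filter, for illustration only):

reql = r.table('pages').filter({'blocked_by_robots': True})
blocked = list(reql.run())
for page in blocked:
    print(page['url'])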

View File

@ -2,7 +2,7 @@
'''
setup.py - brozzler setup script
Copyright (C) 2014-2016 Internet Archive
Copyright (C) 2014-2017 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup(
name='brozzler',
version='1.1b9.dev165',
version='1.1b9.dev176',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',

View File

@ -0,0 +1,13 @@
<html>
<head>
<title>a page that pops up an alert</title>
<script>
alert("I'm an alert")
</script>
</head>
<body>
<h1>alert</h1>
<p>this is a page that pops up an alert</p>
</body>
</html>

View File

@ -0,0 +1,13 @@
<html>
<head>
<title>a page that pops up an alert</title>
<script>
confirm("I'm a confirm dialog")
</script>
</head>
<body>
<h1>confirm</h1>
<p>this is a page that pops up a confirm modal dialog</p>
</body>
</html>

View File

@ -0,0 +1,13 @@
<html>
<head>
<title>a page that pops up a print dialog</title>
<script>
print()
</script>
</head>
<body>
<h1>print</h1>
<p>this is a page that pops up a print dialog</p>
</body>
</html>

View File

@ -0,0 +1,13 @@
<html>
<head>
<title>a page that pops up a prompt</title>
<script>
prompt("I'm a prompt")
</script>
</head>
<body>
<h1>prompt</h1>
<p>this is a page that pops up a prompt</p>
</body>
</html>

View File

@ -29,7 +29,7 @@ import json
args = argparse.Namespace()
args.log_level = logging.INFO
brozzler.cli._configure_logging(args)
brozzler.cli.configure_logging(args)
WARCPROX_META_420 = {
'stats': {
@ -114,7 +114,6 @@ def test_on_response(httpd):
url = 'http://localhost:%s/site3/page.html' % httpd.server_port
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
browser.browse_page(url, on_response=on_response)
browser.browse_page(url)
assert response_urls[0] == 'http://localhost:%s/site3/page.html' % httpd.server_port
assert response_urls[1] == 'http://localhost:%s/site3/brozzler.svg' % httpd.server_port
assert response_urls[2] == 'http://localhost:%s/favicon.ico' % httpd.server_port
@ -126,3 +125,20 @@ def test_420(httpd):
with pytest.raises(brozzler.ReachedLimit) as excinfo:
browser.browse_page(url)
assert excinfo.value.warcprox_meta == WARCPROX_META_420
def test_js_dialogs(httpd):
chrome_exe = brozzler.suggest_default_chrome_exe()
url = 'http://localhost:%s/site4/alert.html' % httpd.server_port
with brozzler.Browser(chrome_exe=chrome_exe) as browser:
# before commit d2ed6b97a24 these would hang and eventually raise
# brozzler.browser.BrowsingTimeout, which would cause this test to fail
browser.browse_page(
'http://localhost:%s/site4/alert.html' % httpd.server_port)
browser.browse_page(
'http://localhost:%s/site4/confirm.html' % httpd.server_port)
browser.browse_page(
'http://localhost:%s/site4/prompt.html' % httpd.server_port)
# XXX print dialog unresolved
# browser.browse_page(
# 'http://localhost:%s/site4/print.html' % httpd.server_port)

View File

@ -3,7 +3,7 @@
test_cluster.py - integration tests for a brozzler cluster, expects brozzler,
warcprox, pywb, rethinkdb and other dependencies to be running already
Copyright (C) 2016 Internet Archive
Copyright (C) 2016-2017 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@ -257,8 +257,9 @@ def test_obey_robots(httpd):
# check that only the one page is in rethinkdb
pages = list(frontier.site_pages(site.id))
assert len(pages) == 1
assert {page.url for page in pages} == {
'http://localhost:%s/site1/' % httpd.server_port}
page = pages[0]
assert page.url == 'http://localhost:%s/site1/' % httpd.server_port
assert page.blocked_by_robots
# take a look at the captures table
time.sleep(2) # in case warcprox hasn't finished processing urls