Merge branch 'qa' of github.com:internetarchive/brozzler into qa

This commit is contained in:
Barbara Miller 2017-01-30 20:52:58 -08:00
commit 6bf8cfe893
16 changed files with 297 additions and 184 deletions

View file

@ -17,3 +17,5 @@ after_failure:
notifications: notifications:
slack: slack:
secure: KPPXSscXnmSEQ2NXBZFKrzDEYHg067Kv1WR7RTRUH8EIlSS9MHTyErRa7HkaRPmqOllj4vvPbplNU2ALnCfhP4cqW+MvF0xv3GuEGXQ7Om2sBvVUQ3w0JJ5rLq9ferAfGdSnQFeViqfDix5LA3fMNZGouUHQdUHq7iO8E9n9jntvkKO9Jff7Dyo0K5KvOZOJfM9KsqFZLlFO5zoNB6Y9jubIT7+Ulk3EDto/Kny34VPIyJIm7y0cHHlYLEq780AweY0EIwMyMg/VPSRrVAsbLSrilO0YRgsQpjPC9Ci/rAWNWooaOk0eA+bwv1uHQnGtH0z446XUMXr3UZ2QlD4DE/uoP2okkl8EtqvlmEyjV8eO86TqYFDRgKfYpvlK6hHtb7SAHX28QeXQjbKNc5f7KpKO5PtZqaoBRL7acLlKyS8xQGiRtonTPFSBTFR2A+s6dZmKO9dDboglptiHk4dvL1ZD4S8qLJn1JjTJqvIU6tpCY3BpNErn4n1MkDjN5nqdXf7Q9Vmui8vRetwnMf1oXcsKj9FEt2utNfDqFNXcFsN+Mnr9rhXQ1++gt/7Zo844OowiARcxqZTNy5LqSD01WgGCvNMy3Odf+FTQ8PcDOF+001+g8La1R99U0o9/hT/gy+WYk2prYneWru4pQHF/a6goZgkLTwkskcaPVpDJtDs= secure: KPPXSscXnmSEQ2NXBZFKrzDEYHg067Kv1WR7RTRUH8EIlSS9MHTyErRa7HkaRPmqOllj4vvPbplNU2ALnCfhP4cqW+MvF0xv3GuEGXQ7Om2sBvVUQ3w0JJ5rLq9ferAfGdSnQFeViqfDix5LA3fMNZGouUHQdUHq7iO8E9n9jntvkKO9Jff7Dyo0K5KvOZOJfM9KsqFZLlFO5zoNB6Y9jubIT7+Ulk3EDto/Kny34VPIyJIm7y0cHHlYLEq780AweY0EIwMyMg/VPSRrVAsbLSrilO0YRgsQpjPC9Ci/rAWNWooaOk0eA+bwv1uHQnGtH0z446XUMXr3UZ2QlD4DE/uoP2okkl8EtqvlmEyjV8eO86TqYFDRgKfYpvlK6hHtb7SAHX28QeXQjbKNc5f7KpKO5PtZqaoBRL7acLlKyS8xQGiRtonTPFSBTFR2A+s6dZmKO9dDboglptiHk4dvL1ZD4S8qLJn1JjTJqvIU6tpCY3BpNErn4n1MkDjN5nqdXf7Q9Vmui8vRetwnMf1oXcsKj9FEt2utNfDqFNXcFsN+Mnr9rhXQ1++gt/7Zo844OowiARcxqZTNy5LqSD01WgGCvNMy3Odf+FTQ8PcDOF+001+g8La1R99U0o9/hT/gy+WYk2prYneWru4pQHF/a6goZgkLTwkskcaPVpDJtDs=
secure: jopAXO8j3AkNWhF02GIzlkHJmqcCfrDEDPHcLHwxGB1vKrJqfMtcmV1+JXv7jGPwT8hBkkZItD1fTbsA1UMTtZCsadhqwrH9sh/BtJy4mf1jDDK0Hq4bPdbpB/mHKBfjD+ZedPZphCiwRQm94QdMOAsmCsj1BluFn+ySHuNAnwyXCNohut5a3aFBszOwBNgZMwBmu+weAUpMrDbr/dhqOtU0IaNvhTJ2Ykyex7Of86L05lBI8MiGtq/J73uDiDINWViBXqG5+/LKIVLvnjzCxZOnOVtSVorRNY0OsClfLJILuWOXk0/C3p+lBCyq5iatWweNqcqqpMifUSdVp4x8GnPyvl4O5YuIZW674mpGmH6UW10MqEnqxFQIcZpArir/zToK/cIKsUse20n8U5LUgOSWeNM1RIBvc4ckeDuthjwvyfmP0hrnNxrPFxRez2J2r6alWFABvD0H83a3hn56AtGXqV+9gt9d4J0+vnBJkXMidQaORBnyRkPlTROxqkoK8r0PME8xr6GwDWHpUN7/Ibo9gS/zpA7zpJUIsAsevVKOSaITZwKqbCMTI3uy/tJcnzRUrnq5wqhh8vXlWzIxEvTW8vuIapjSvDzhnJga85bIEmoauyMd13gR/vhqXQ3xUdN5LeyXAPn24b5e2GNSrhDOaAs30tXe+Z31njSeKPM=

View file

@ -157,7 +157,10 @@ class WebsockReceiverThread(threading.Thread):
brozzler.thread_raise(self.calling_thread, BrowsingException) brozzler.thread_raise(self.calling_thread, BrowsingException)
def run(self): def run(self):
self.websock.run_forever() # ping_timeout is used as the timeout for the call to select.select()
# in addition to its documented purpose, and must have a value to avoid
# hangs in certain situations
self.websock.run_forever(ping_timeout=0.5)
def _on_message(self, websock, message): def _on_message(self, websock, message):
try: try:
@ -202,6 +205,17 @@ class WebsockReceiverThread(threading.Thread):
if self.on_response: if self.on_response:
self.on_response(message) self.on_response(message)
def _javascript_dialog_opening(self, message):
    '''
    Answers a javascript dialog that the browser reports as opening.

    Logs the incoming message, then sends a Page.handleJavaScriptDialog
    command over the websocket. The dialog is accepted only when its type
    is 'alert'; every other type is answered with accept=False.
    '''
    self.logger.info('javascript dialog opened: %s', message)
    # the comparison yields the same True/False the command expects
    accept = message['params']['type'] == 'alert'
    command = dict(
            id=0, method='Page.handleJavaScriptDialog',
            params={'accept': accept})
    self.websock.send(json.dumps(command))
def _handle_message(self, websock, json_message): def _handle_message(self, websock, json_message):
message = json.loads(json_message) message = json.loads(json_message)
if 'method' in message: if 'method' in message:
@ -223,6 +237,8 @@ class WebsockReceiverThread(threading.Thread):
'%s console.%s %s', self.websock.url, '%s console.%s %s', self.websock.url,
message['params']['message']['level'], message['params']['message']['level'],
message['params']['message']['text']) message['params']['message']['text'])
elif message['method'] == 'Page.javascriptDialogOpening':
self._javascript_dialog_opening(message)
# else: # else:
# self.logger.debug("%s %s", message["method"], json_message) # self.logger.debug("%s %s", message["method"], json_message)
elif 'result' in message: elif 'result' in message:
@ -540,6 +556,7 @@ class Browser:
timeout=5) timeout=5)
msg = self.websock_thread.pop_result(msg_id) msg = self.websock_thread.pop_result(msg_id)
if (msg and 'result' in msg if (msg and 'result' in msg
and not ('exceptionDetails' in msg['result'])
and not ('wasThrown' in msg['result'] and not ('wasThrown' in msg['result']
and msg['result']['wasThrown']) and msg['result']['wasThrown'])
and 'result' in msg['result'] and 'result' in msg['result']

View file

@ -177,6 +177,7 @@ class Chrome:
json_url = 'http://localhost:%s/json' % self.port json_url = 'http://localhost:%s/json' % self.port
# make this a member variable so that kill -QUIT reports it # make this a member variable so that kill -QUIT reports it
self._start = time.time() self._start = time.time()
self._last_warning = self._start
while True: while True:
try: try:
raw_json = urllib.request.urlopen(json_url, timeout=30).read() raw_json = urllib.request.urlopen(json_url, timeout=30).read()
@ -194,11 +195,11 @@ class Chrome:
except brozzler.ShutdownRequested: except brozzler.ShutdownRequested:
raise raise
except BaseException as e: except BaseException as e:
if int(time.time() - self._start) % 10 == 5: if time.time() - self._last_warning > 30:
self.logger.warn( self.logger.warn(
'problem with %s (will keep trying until timeout ' 'problem with %s (will keep trying until timeout '
'of %d seconds): %s', json_url, timeout_sec, e) 'of %d seconds): %s', json_url, timeout_sec, e)
pass self._last_warning = time.time()
finally: finally:
if time.time() - self._start > timeout_sec: if time.time() - self._start > timeout_sec:
self.logger.error( self.logger.error(

View file

@ -2,7 +2,7 @@
''' '''
brozzler/cli.py - brozzler command line executables brozzler/cli.py - brozzler command line executables
Copyright (C) 2014-2016 Internet Archive Copyright (C) 2014-2017 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
@ -38,16 +38,19 @@ import yaml
import shutil import shutil
import base64 import base64
def _add_common_options(arg_parser): def add_common_options(arg_parser):
arg_parser.add_argument( arg_parser.add_argument(
'-q', '--quiet', dest='log_level', '-q', '--quiet', dest='log_level', action='store_const',
action='store_const', default=logging.INFO, const=logging.WARN) default=logging.INFO, const=logging.WARN, help=(
'quiet logging, only warnings and errors'))
arg_parser.add_argument( arg_parser.add_argument(
'-v', '--verbose', dest='log_level', '-v', '--verbose', dest='log_level', action='store_const',
action='store_const', default=logging.INFO, const=logging.DEBUG) default=logging.INFO, const=logging.DEBUG, help=(
'verbose logging'))
arg_parser.add_argument( arg_parser.add_argument(
'--trace', dest='log_level', '--trace', dest='log_level', action='store_const',
action='store_const', default=logging.INFO, const=brozzler.TRACE) default=logging.INFO, const=brozzler.TRACE, help=(
'very verbose logging'))
# arg_parser.add_argument( # arg_parser.add_argument(
# '-s', '--silent', dest='log_level', action='store_const', # '-s', '--silent', dest='log_level', action='store_const',
# default=logging.INFO, const=logging.CRITICAL) # default=logging.INFO, const=logging.CRITICAL)
@ -56,15 +59,26 @@ def _add_common_options(arg_parser):
version='brozzler %s - %s' % ( version='brozzler %s - %s' % (
brozzler.__version__, os.path.basename(sys.argv[0]))) brozzler.__version__, os.path.basename(sys.argv[0])))
def _add_rethinkdb_options(arg_parser): def add_rethinkdb_options(arg_parser):
arg_parser.add_argument( arg_parser.add_argument(
'--rethinkdb-servers', dest='rethinkdb_servers', '--rethinkdb-servers', dest='rethinkdb_servers',
default='localhost', help=( default=os.environ.get('BROZZLER_RETHINKDB_SERVERS', 'localhost'),
help=(
'rethinkdb servers, e.g. ' 'rethinkdb servers, e.g. '
'db0.foo.org,db0.foo.org:38015,db1.foo.org')) 'db0.foo.org,db0.foo.org:38015,db1.foo.org (default is the '
'value of environment variable BROZZLER_RETHINKDB_SERVERS)'))
arg_parser.add_argument( arg_parser.add_argument(
'--rethinkdb-db', dest='rethinkdb_db', default='brozzler', '--rethinkdb-db', dest='rethinkdb_db',
help='rethinkdb database name') default=os.environ.get('BROZZLER_RETHINKDB_DB', 'brozzler'),
help=(
'rethinkdb database name (default is the value of environment '
'variable BROZZLER_RETHINKDB_DB)'))
def rethinker(args):
    '''
    Builds a rethinkstuff.Rethinker from parsed command line arguments.

    Args:
        args: parsed arguments object with attributes `rethinkdb_servers`
            (comma-separated list of servers) and `rethinkdb_db`

    Returns:
        rethinkstuff.Rethinker for the chosen servers and database

    When an attribute is empty or None, the value falls back first to the
    matching environment variable (BROZZLER_RETHINKDB_SERVERS or
    BROZZLER_RETHINKDB_DB) and then to the built-in default ('localhost'
    or 'brozzler'), so both settings follow the same fallback chain.
    '''
    servers = args.rethinkdb_servers or os.environ.get(
            'BROZZLER_RETHINKDB_SERVERS') or 'localhost'
    db = args.rethinkdb_db or os.environ.get(
            'BROZZLER_RETHINKDB_DB') or 'brozzler'
    return rethinkstuff.Rethinker(servers.split(','), db)
def _add_proxy_options(arg_parser): def _add_proxy_options(arg_parser):
arg_parser.add_argument( arg_parser.add_argument(
@ -75,7 +89,7 @@ def _add_proxy_options(arg_parser):
'enable special features that assume the configured proxy is ' 'enable special features that assume the configured proxy is '
'warcprox')) 'warcprox'))
def _configure_logging(args): def configure_logging(args):
logging.basicConfig( logging.basicConfig(
stream=sys.stderr, level=args.log_level, stream=sys.stderr, level=args.log_level,
format=( format=(
@ -107,6 +121,18 @@ def suggest_default_chrome_exe():
return exe return exe
return 'chromium-browser' return 'chromium-browser'
class BetterArgumentDefaultsHelpFormatter(
        argparse.ArgumentDefaultsHelpFormatter):
    '''
    Variant of argparse.ArgumentDefaultsHelpFormatter that leaves the help
    text of action='store_const' arguments untouched, i.e. without the
    default value appended.
    '''
    def _get_help_string(self, action):
        # anything that is not a store_const action gets the stock
        # behavior of the parent formatter
        if not isinstance(action, argparse._StoreConstAction):
            return super()._get_help_string(action)
        return action.help
def brozzle_page(): def brozzle_page():
''' '''
Command line utility entry point for brozzling a single page. Opens url in Command line utility entry point for brozzling a single page. Opens url in
@ -115,7 +141,7 @@ def brozzle_page():
arg_parser = argparse.ArgumentParser( arg_parser = argparse.ArgumentParser(
prog=os.path.basename(sys.argv[0]), prog=os.path.basename(sys.argv[0]),
description='brozzle-page - brozzle a single page', description='brozzle-page - brozzle a single page',
formatter_class=argparse.ArgumentDefaultsHelpFormatter) formatter_class=BetterArgumentDefaultsHelpFormatter)
arg_parser.add_argument('url', metavar='URL', help='page url') arg_parser.add_argument('url', metavar='URL', help='page url')
arg_parser.add_argument( arg_parser.add_argument(
'-e', '--chrome-exe', dest='chrome_exe', '-e', '--chrome-exe', dest='chrome_exe',
@ -141,10 +167,10 @@ def brozzle_page():
action='store_true', help=( action='store_true', help=(
'enable special features that assume the configured proxy ' 'enable special features that assume the configured proxy '
'is warcprox')) 'is warcprox'))
_add_common_options(arg_parser) add_common_options(arg_parser)
args = arg_parser.parse_args(args=sys.argv[1:]) args = arg_parser.parse_args(args=sys.argv[1:])
_configure_logging(args) configure_logging(args)
behavior_parameters = {} behavior_parameters = {}
if args.behavior_parameters: if args.behavior_parameters:
@ -191,14 +217,13 @@ def brozzler_new_job():
arg_parser.add_argument( arg_parser.add_argument(
'job_conf_file', metavar='JOB_CONF_FILE', 'job_conf_file', metavar='JOB_CONF_FILE',
help='brozzler job configuration file in yaml') help='brozzler job configuration file in yaml')
_add_rethinkdb_options(arg_parser) add_rethinkdb_options(arg_parser)
_add_common_options(arg_parser) add_common_options(arg_parser)
args = arg_parser.parse_args(args=sys.argv[1:]) args = arg_parser.parse_args(args=sys.argv[1:])
_configure_logging(args) configure_logging(args)
r = rethinkstuff.Rethinker( r = rethinker(args)
args.rethinkdb_servers.split(','), args.rethinkdb_db)
frontier = brozzler.RethinkDbFrontier(r) frontier = brozzler.RethinkDbFrontier(r)
try: try:
brozzler.job.new_job_file(frontier, args.job_conf_file) brozzler.job.new_job_file(frontier, args.job_conf_file)
@ -218,7 +243,7 @@ def brozzler_new_site():
description='brozzler-new-site - register site to brozzle', description='brozzler-new-site - register site to brozzle',
formatter_class=argparse.ArgumentDefaultsHelpFormatter) formatter_class=argparse.ArgumentDefaultsHelpFormatter)
arg_parser.add_argument('seed', metavar='SEED', help='seed url') arg_parser.add_argument('seed', metavar='SEED', help='seed url')
_add_rethinkdb_options(arg_parser) add_rethinkdb_options(arg_parser)
_add_proxy_options(arg_parser) _add_proxy_options(arg_parser)
arg_parser.add_argument( arg_parser.add_argument(
'--time-limit', dest='time_limit', default=None, '--time-limit', dest='time_limit', default=None,
@ -244,10 +269,10 @@ def brozzler_new_site():
arg_parser.add_argument( arg_parser.add_argument(
'--password', dest='password', default=None, '--password', dest='password', default=None,
help='use this password to try to log in if a login form is found') help='use this password to try to log in if a login form is found')
_add_common_options(arg_parser) add_common_options(arg_parser)
args = arg_parser.parse_args(args=sys.argv[1:]) args = arg_parser.parse_args(args=sys.argv[1:])
_configure_logging(args) configure_logging(args)
site = brozzler.Site( site = brozzler.Site(
seed=args.seed, proxy=args.proxy, seed=args.seed, proxy=args.proxy,
@ -260,8 +285,7 @@ def brozzler_new_site():
args.behavior_parameters) if args.behavior_parameters else None, args.behavior_parameters) if args.behavior_parameters else None,
username=args.username, password=args.password) username=args.username, password=args.password)
r = rethinkstuff.Rethinker( r = rethinker()
args.rethinkdb_servers.split(","), args.rethinkdb_db)
frontier = brozzler.RethinkDbFrontier(r) frontier = brozzler.RethinkDbFrontier(r)
brozzler.new_site(frontier, site) brozzler.new_site(frontier, site)
@ -273,7 +297,7 @@ def brozzler_worker():
arg_parser = argparse.ArgumentParser( arg_parser = argparse.ArgumentParser(
prog=os.path.basename(__file__), prog=os.path.basename(__file__),
formatter_class=argparse.ArgumentDefaultsHelpFormatter) formatter_class=argparse.ArgumentDefaultsHelpFormatter)
_add_rethinkdb_options(arg_parser) add_rethinkdb_options(arg_parser)
arg_parser.add_argument( arg_parser.add_argument(
'-e', '--chrome-exe', dest='chrome_exe', '-e', '--chrome-exe', dest='chrome_exe',
default=suggest_default_chrome_exe(), default=suggest_default_chrome_exe(),
@ -281,10 +305,10 @@ def brozzler_worker():
arg_parser.add_argument( arg_parser.add_argument(
'-n', '--max-browsers', dest='max_browsers', default='1', '-n', '--max-browsers', dest='max_browsers', default='1',
help='max number of chrome instances simultaneously browsing pages') help='max number of chrome instances simultaneously browsing pages')
_add_common_options(arg_parser) add_common_options(arg_parser)
args = arg_parser.parse_args(args=sys.argv[1:]) args = arg_parser.parse_args(args=sys.argv[1:])
_configure_logging(args) configure_logging(args)
def sigterm(signum, frame): def sigterm(signum, frame):
raise brozzler.ShutdownRequested('shutdown requested (caught SIGTERM)') raise brozzler.ShutdownRequested('shutdown requested (caught SIGTERM)')
@ -316,8 +340,7 @@ def brozzler_worker():
signal.signal(signal.SIGTERM, sigterm) signal.signal(signal.SIGTERM, sigterm)
signal.signal(signal.SIGINT, sigint) signal.signal(signal.SIGINT, sigint)
r = rethinkstuff.Rethinker( r = rethinker(args)
args.rethinkdb_servers.split(','), args.rethinkdb_db)
frontier = brozzler.RethinkDbFrontier(r) frontier = brozzler.RethinkDbFrontier(r)
service_registry = rethinkstuff.ServiceRegistry(r) service_registry = rethinkstuff.ServiceRegistry(r)
worker = brozzler.worker.BrozzlerWorker( worker = brozzler.worker.BrozzlerWorker(
@ -339,14 +362,13 @@ def brozzler_ensure_tables():
arg_parser = argparse.ArgumentParser( arg_parser = argparse.ArgumentParser(
prog=os.path.basename(sys.argv[0]), prog=os.path.basename(sys.argv[0]),
formatter_class=argparse.ArgumentDefaultsHelpFormatter) formatter_class=argparse.ArgumentDefaultsHelpFormatter)
_add_rethinkdb_options(arg_parser) add_rethinkdb_options(arg_parser)
_add_common_options(arg_parser) add_common_options(arg_parser)
args = arg_parser.parse_args(args=sys.argv[1:]) args = arg_parser.parse_args(args=sys.argv[1:])
_configure_logging(args) configure_logging(args)
r = rethinkstuff.Rethinker( r = rethinker(args)
args.rethinkdb_servers.split(','), args.rethinkdb_db)
# services table # services table
rethinkstuff.ServiceRegistry(r) rethinkstuff.ServiceRegistry(r)
@ -370,14 +392,13 @@ def brozzler_list_jobs():
arg_parser.add_argument( arg_parser.add_argument(
'-a', '--all', dest='all', action='store_true', help=( '-a', '--all', dest='all', action='store_true', help=(
'list all jobs (by default, only active jobs are listed)')) 'list all jobs (by default, only active jobs are listed)'))
_add_rethinkdb_options(arg_parser) add_rethinkdb_options(arg_parser)
_add_common_options(arg_parser) add_common_options(arg_parser)
args = arg_parser.parse_args(args=sys.argv[1:]) args = arg_parser.parse_args(args=sys.argv[1:])
_configure_logging(args) configure_logging(args)
r = rethinkstuff.Rethinker( r = rethinker(args)
args.rethinkdb_servers.split(','), args.rethinkdb_db)
reql = r.table('jobs').order_by('id') reql = r.table('jobs').order_by('id')
if not args.all: if not args.all:
reql = reql.filter({'status': 'ACTIVE'}) reql = reql.filter({'status': 'ACTIVE'})
@ -400,14 +421,13 @@ def brozzler_list_sites():
group.add_argument( group.add_argument(
'--job', dest='job', metavar='JOB_ID', help=( '--job', dest='job', metavar='JOB_ID', help=(
'list only sites for the supplied job')) 'list only sites for the supplied job'))
_add_rethinkdb_options(arg_parser) add_rethinkdb_options(arg_parser)
_add_common_options(arg_parser) add_common_options(arg_parser)
args = arg_parser.parse_args(args=sys.argv[1:]) args = arg_parser.parse_args(args=sys.argv[1:])
_configure_logging(args) configure_logging(args)
r = rethinkstuff.Rethinker( r = rethinker(args)
args.rethinkdb_servers.split(','), args.rethinkdb_db)
reql = r.table('sites') reql = r.table('sites')
if args.job: if args.job:
@ -447,14 +467,13 @@ def brozzler_list_pages():
'--claimed', dest='claimed', action='store_true', help=( '--claimed', dest='claimed', action='store_true', help=(
'limit only pages that are currently claimed by a brozzler ' 'limit only pages that are currently claimed by a brozzler '
'worker')) 'worker'))
_add_rethinkdb_options(arg_parser) add_rethinkdb_options(arg_parser)
_add_common_options(arg_parser) add_common_options(arg_parser)
args = arg_parser.parse_args(args=sys.argv[1:]) args = arg_parser.parse_args(args=sys.argv[1:])
_configure_logging(args) configure_logging(args)
r = rethinkstuff.Rethinker( r = rethinker(args)
args.rethinkdb_servers.split(','), args.rethinkdb_db)
if args.job: if args.job:
try: try:
job_id = int(args.job) job_id = int(args.job)
@ -507,17 +526,16 @@ def brozzler_list_captures():
'use prefix match for url (n.b. may not work as expected if ' 'use prefix match for url (n.b. may not work as expected if '
'searching key has query string because canonicalization can ' 'searching key has query string because canonicalization can '
'reorder query parameters)')) 'reorder query parameters)'))
_add_rethinkdb_options(arg_parser) add_rethinkdb_options(arg_parser)
_add_common_options(arg_parser) add_common_options(arg_parser)
arg_parser.add_argument( arg_parser.add_argument(
'url_or_sha1', metavar='URL_or_SHA1', 'url_or_sha1', metavar='URL_or_SHA1',
help='url or sha1 to look up in captures table') help='url or sha1 to look up in captures table')
args = arg_parser.parse_args(args=sys.argv[1:]) args = arg_parser.parse_args(args=sys.argv[1:])
_configure_logging(args) configure_logging(args)
r = rethinkstuff.Rethinker( r = rethinker(args)
args.rethinkdb_servers.split(','), args.rethinkdb_db)
if args.url_or_sha1[:5] == 'sha1:': if args.url_or_sha1[:5] == 'sha1:':
if args.prefix: if args.prefix:

View file

@ -2,7 +2,7 @@
brozzler/dashboard/__init__.py - flask app for brozzler dashboard, defines api brozzler/dashboard/__init__.py - flask app for brozzler dashboard, defines api
endspoints etc endspoints etc
Copyright (C) 2014-2016 Internet Archive Copyright (C) 2014-2017 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
@ -35,27 +35,15 @@ import rethinkdb
import yaml import yaml
import base64 import base64
# flask does its own logging config
# logging.basicConfig(
# stream=sys.stdout, level=logging.INFO,
# format=(
# "%(asctime)s %(process)d %(levelname)s %(threadName)s "
# "%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s")
app = flask.Flask(__name__) app = flask.Flask(__name__)
# http://stackoverflow.com/questions/26578733/why-is-flask-application-not-creating-any-logs-when-hosted-by-gunicorn
gunicorn_error_logger = logging.getLogger('gunicorn.error')
app.logger.handlers.extend(gunicorn_error_logger.handlers)
app.logger.setLevel(logging.INFO)
# configure with environment variables # configure with environment variables
SETTINGS = { SETTINGS = {
'RETHINKDB_SERVERS': os.environ.get( 'RETHINKDB_SERVERS': os.environ.get(
'RETHINKDB_SERVERS', 'localhost').split(','), 'BROZZLER_RETHINKDB_SERVERS', 'localhost').split(','),
'RETHINKDB_DB': os.environ.get('RETHINKDB_DB', 'brozzler'), 'RETHINKDB_DB': os.environ.get('BROZZLER_RETHINKDB_DB', 'brozzler'),
'WAYBACK_BASEURL': os.environ.get( 'WAYBACK_BASEURL': os.environ.get(
'WAYBACK_BASEURL', 'http://localhost:8091/brozzler'), 'WAYBACK_BASEURL', 'http://localhost:8880/brozzler'),
} }
r = rethinkstuff.Rethinker( r = rethinkstuff.Rethinker(
SETTINGS['RETHINKDB_SERVERS'], db=SETTINGS['RETHINKDB_DB']) SETTINGS['RETHINKDB_SERVERS'], db=SETTINGS['RETHINKDB_DB'])
@ -69,20 +57,24 @@ def service_registry():
@app.route("/api/sites/<site_id>/queued_count") @app.route("/api/sites/<site_id>/queued_count")
@app.route("/api/site/<site_id>/queued_count") @app.route("/api/site/<site_id>/queued_count")
def queued_count(site_id): def queued_count(site_id):
count = r.table("pages").between( reql = r.table("pages").between(
[site_id, 0, False, r.minval], [site_id, 0, False, r.maxval], [site_id, 0, False, r.minval], [site_id, 0, False, r.maxval],
index="priority_by_site").count().run() index="priority_by_site").count()
logging.debug("querying rethinkdb: %s", reql)
count = reql.run()
return flask.jsonify(count=count) return flask.jsonify(count=count)
@app.route("/api/sites/<site_id>/queue") @app.route("/api/sites/<site_id>/queue")
@app.route("/api/site/<site_id>/queue") @app.route("/api/site/<site_id>/queue")
def queue(site_id): def queue(site_id):
app.logger.info("flask.request.args=%s", flask.request.args) logging.debug("flask.request.args=%s", flask.request.args)
start = flask.request.args.get("start", 0) start = flask.request.args.get("start", 0)
end = flask.request.args.get("end", start + 90) end = flask.request.args.get("end", start + 90)
queue_ = r.table("pages").between( reql = r.table("pages").between(
[site_id, 0, False, r.minval], [site_id, 0, False, r.maxval], [site_id, 0, False, r.minval], [site_id, 0, False, r.maxval],
index="priority_by_site")[start:end].run() index="priority_by_site")[start:end]
logging.debug("querying rethinkdb: %s", reql)
queue_ = reql.run()
return flask.jsonify(queue_=list(queue_)) return flask.jsonify(queue_=list(queue_))
@app.route("/api/sites/<site_id>/pages_count") @app.route("/api/sites/<site_id>/pages_count")
@ -90,42 +82,51 @@ def queue(site_id):
@app.route("/api/sites/<site_id>/page_count") @app.route("/api/sites/<site_id>/page_count")
@app.route("/api/site/<site_id>/page_count") @app.route("/api/site/<site_id>/page_count")
def page_count(site_id): def page_count(site_id):
count = r.table("pages").between( reql = r.table("pages").between(
[site_id, 1, False, r.minval], [site_id, 1, False, r.minval],
[site_id, r.maxval, False, r.maxval], [site_id, r.maxval, False, r.maxval],
index="priority_by_site").count().run() index="priority_by_site").count()
logging.debug("querying rethinkdb: %s", reql)
count = reql.run()
return flask.jsonify(count=count) return flask.jsonify(count=count)
@app.route("/api/sites/<site_id>/pages") @app.route("/api/sites/<site_id>/pages")
@app.route("/api/site/<site_id>/pages") @app.route("/api/site/<site_id>/pages")
def pages(site_id): def pages(site_id):
"""Pages already crawled.""" """Pages already crawled."""
app.logger.info("flask.request.args=%s", flask.request.args)
start = int(flask.request.args.get("start", 0)) start = int(flask.request.args.get("start", 0))
end = int(flask.request.args.get("end", start + 90)) end = int(flask.request.args.get("end", start + 90))
pages_ = r.table("pages").between( reql = r.table("pages").between(
[site_id, 1, r.minval], [site_id, r.maxval, r.maxval], [site_id, 1, r.minval], [site_id, r.maxval, r.maxval],
index="least_hops").order_by(index="least_hops")[start:end].run() index="least_hops").order_by(index="least_hops")[start:end]
logging.debug("querying rethinkdb: %s", reql)
pages_ = reql.run()
return flask.jsonify(pages=list(pages_)) return flask.jsonify(pages=list(pages_))
@app.route("/api/pages/<page_id>") @app.route("/api/pages/<page_id>")
@app.route("/api/page/<page_id>") @app.route("/api/page/<page_id>")
def page(page_id): def page(page_id):
page_ = r.table("pages").get(page_id).run() reql = r.table("pages").get(page_id)
logging.debug("querying rethinkdb: %s", reql)
page_ = reql.run()
return flask.jsonify(page_) return flask.jsonify(page_)
@app.route("/api/pages/<page_id>/yaml") @app.route("/api/pages/<page_id>/yaml")
@app.route("/api/page/<page_id>/yaml") @app.route("/api/page/<page_id>/yaml")
def page_yaml(page_id): def page_yaml(page_id):
page_ = r.table("pages").get(page_id).run() reql = r.table("pages").get(page_id)
logging.debug("querying rethinkdb: %s", reql)
page_ = reql.run()
return app.response_class( return app.response_class(
yaml.dump(page_, default_flow_style=False), yaml.dump(page_, default_flow_style=False),
mimetype='application/yaml') mimetype="application/yaml")
@app.route("/api/sites/<site_id>") @app.route("/api/sites/<site_id>")
@app.route("/api/site/<site_id>") @app.route("/api/site/<site_id>")
def site(site_id): def site(site_id):
s = r.table("sites").get(site_id).run() reql = r.table("sites").get(site_id)
logging.debug("querying rethinkdb: %s", reql)
s = reql.run()
if "cookie_db" in s: if "cookie_db" in s:
s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii") s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii")
return flask.jsonify(s) return flask.jsonify(s)
@ -133,20 +134,30 @@ def site(site_id):
@app.route("/api/sites/<site_id>/yaml") @app.route("/api/sites/<site_id>/yaml")
@app.route("/api/site/<site_id>/yaml") @app.route("/api/site/<site_id>/yaml")
def site_yaml(site_id): def site_yaml(site_id):
site_ = r.table("sites").get(site_id).run() reql = r.table("sites").get(site_id)
logging.debug("querying rethinkdb: %s", reql)
site_ = reql.run()
return app.response_class( return app.response_class(
yaml.dump(site_, default_flow_style=False), yaml.dump(site_, default_flow_style=False),
mimetype='application/yaml') mimetype="application/yaml")
@app.route("/api/stats/<bucket>") @app.route("/api/stats/<bucket>")
def stats(bucket): def stats(bucket):
stats_ = r.table("stats").get(bucket).run() reql = r.table("stats").get(bucket)
logging.debug("querying rethinkdb: %s", reql)
stats_ = reql.run()
return flask.jsonify(stats_) return flask.jsonify(stats_)
@app.route("/api/jobs/<int:job_id>/sites") @app.route("/api/jobs/<job_id>/sites")
@app.route("/api/job/<int:job_id>/sites") @app.route("/api/job/<job_id>/sites")
def sites(job_id): def sites(job_id):
sites_ = list(r.table("sites").get_all(job_id, index="job_id").run()) try:
jid = int(job_id)
except ValueError:
jid = job_id
reql = r.table("sites").get_all(jid, index="job_id")
logging.debug("querying rethinkdb: %s", reql)
sites_ = list(reql.run())
# TypeError: <binary, 7168 bytes, '53 51 4c 69 74 65...'> is not JSON serializable # TypeError: <binary, 7168 bytes, '53 51 4c 69 74 65...'> is not JSON serializable
for s in sites_: for s in sites_:
if "cookie_db" in s: if "cookie_db" in s:
@ -156,26 +167,40 @@ def sites(job_id):
@app.route("/api/jobless-sites") @app.route("/api/jobless-sites")
def jobless_sites(): def jobless_sites():
# XXX inefficient (unindexed) query # XXX inefficient (unindexed) query
sites_ = list(r.table("sites").filter(~r.row.has_fields("job_id")).run()) reql = r.table("sites").filter(~r.row.has_fields("job_id"))
logging.debug("querying rethinkdb: %s", reql)
sites_ = list(reql.run())
# TypeError: <binary, 7168 bytes, '53 51 4c 69 74 65...'> is not JSON serializable # TypeError: <binary, 7168 bytes, '53 51 4c 69 74 65...'> is not JSON serializable
for s in sites_: for s in sites_:
if "cookie_db" in s: if "cookie_db" in s:
s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii") s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii")
return flask.jsonify(sites=sites_) return flask.jsonify(sites=sites_)
@app.route("/api/jobs/<int:job_id>") @app.route("/api/jobs/<job_id>")
@app.route("/api/job/<int:job_id>") @app.route("/api/job/<job_id>")
def job(job_id): def job(job_id):
job_ = r.table("jobs").get(job_id).run() try:
jid = int(job_id)
except ValueError:
jid = job_id
reql = r.table("jobs").get(jid)
logging.debug("querying rethinkdb: %s", reql)
job_ = reql.run()
return flask.jsonify(job_) return flask.jsonify(job_)
@app.route("/api/jobs/<int:job_id>/yaml") @app.route("/api/jobs/<job_id>/yaml")
@app.route("/api/job/<int:job_id>/yaml") @app.route("/api/job/<job_id>/yaml")
def job_yaml(job_id): def job_yaml(job_id):
job_ = r.table("jobs").get(job_id).run() try:
jid = int(job_id)
except ValueError:
jid = job_id
reql = r.table("jobs").get(jid)
logging.debug("querying rethinkdb: %s", reql)
job_ = reql.run()
return app.response_class( return app.response_class(
yaml.dump(job_, default_flow_style=False), yaml.dump(job_, default_flow_style=False),
mimetype='application/yaml') mimetype="application/yaml")
@app.route("/api/workers") @app.route("/api/workers")
def workers(): def workers():
@ -189,7 +214,9 @@ def services():
@app.route("/api/jobs") @app.route("/api/jobs")
def jobs(): def jobs():
jobs_ = list(r.table("jobs").order_by(rethinkdb.desc("id")).run()) reql = r.table("jobs").order_by(rethinkdb.desc("id"))
logging.debug("querying rethinkdb: %s", reql)
jobs_ = list(reql.run())
return flask.jsonify(jobs=jobs_) return flask.jsonify(jobs=jobs_)
@app.route("/api/config") @app.route("/api/config")
@ -209,6 +236,12 @@ def root(path):
try: try:
import gunicorn.app.base import gunicorn.app.base
from gunicorn.six import iteritems from gunicorn.six import iteritems
import gunicorn.glogging
class BypassGunicornLogging(gunicorn.glogging.Logger):
    """
    gunicorn logger class whose error and access logs use the handlers of
    the root python logger.
    """
    def setup(self, cfg):
        # cfg is not consulted; both gunicorn logs are pointed at the
        # root logger's handler list
        root_handlers = logging.root.handlers
        for log in (self.error_log, self.access_log):
            log.handlers = root_handlers
class GunicornBrozzlerDashboard(gunicorn.app.base.BaseApplication): class GunicornBrozzlerDashboard(gunicorn.app.base.BaseApplication):
def __init__(self, app, options=None): def __init__(self, app, options=None):
@ -222,21 +255,24 @@ try:
if key in self.cfg.settings and value is not None]) if key in self.cfg.settings and value is not None])
for key, value in iteritems(config): for key, value in iteritems(config):
self.cfg.set(key.lower(), value) self.cfg.set(key.lower(), value)
self.cfg.set("logger_class", BypassGunicornLogging)
self.cfg.set("accesslog", "dummy-value")
def load(self): def load(self):
return self.application return self.application
def run(**options): def run(**options):
logging.info('running brozzler-dashboard using gunicorn') logging.info("running brozzler-dashboard using gunicorn")
GunicornBrozzlerDashboard(app, options).run() GunicornBrozzlerDashboard(app, options).run()
except ImportError: except ImportError:
def run(): def run():
logging.info('running brozzler-dashboard using simple flask app.run') logging.info("running brozzler-dashboard using simple flask app.run")
app.run() app.run()
def main(): def main():
import argparse import argparse
import brozzler.cli
arg_parser = argparse.ArgumentParser( arg_parser = argparse.ArgumentParser(
prog=os.path.basename(sys.argv[0]), prog=os.path.basename(sys.argv[0]),
formatter_class=argparse.RawDescriptionHelpFormatter, formatter_class=argparse.RawDescriptionHelpFormatter,
@ -246,13 +282,16 @@ def main():
epilog=( epilog=(
'brozzler-dashboard has no command line options, but can be ' 'brozzler-dashboard has no command line options, but can be '
'configured using the following environment variables:\n\n' 'configured using the following environment variables:\n\n'
' RETHINKDB_SERVERS rethinkdb servers, e.g. db0.foo.org,' ' BROZZLER_RETHINKDB_SERVERS rethinkdb servers, e.g. '
'db0.foo.org:38015,db1.foo.org (default: localhost)\n' 'db0.foo.org,db0.foo.org:38015,db1.foo.org (default: '
' RETHINKDB_DB rethinkdb database name (default: ' 'localhost)\n'
'brozzler)\n' ' BROZZLER_RETHINKDB_DB rethinkdb database name '
'(default: brozzler)\n'
' WAYBACK_BASEURL base url for constructing wayback ' ' WAYBACK_BASEURL base url for constructing wayback '
'links (default http://localhost:8091/brozzler)')) 'links (default http://localhost:8880/brozzler)'))
brozzler.cli.add_common_options(arg_parser)
args = arg_parser.parse_args(args=sys.argv[1:]) args = arg_parser.parse_args(args=sys.argv[1:])
brozzler.cli.configure_logging(args)
run() run()
if __name__ == "__main__": if __name__ == "__main__":

View file

@ -1,7 +1,7 @@
/* /*
* brozzler/dashboard/static/js/app.js - brozzler dashboard angularjs code * brozzler/dashboard/static/js/app.js - brozzler dashboard angularjs code
* *
* Copyright (C) 2014-2016 Internet Archive * Copyright (C) 2014-2017 Internet Archive
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. * you may not use this file except in compliance with the License.
@ -96,16 +96,12 @@ brozzlerControllers.controller("WorkersListController", ["$scope", "$http",
function statsSuccessCallback(site, bucket) { function statsSuccessCallback(site, bucket) {
return function(data) { return function(data) {
// console.log("site = ", site);
// console.log("/api/stats/" + bucket + " = ", data);
site.stats = data; site.stats = data;
} }
} }
function pageCountSuccessCallback(site, job) { function pageCountSuccessCallback(site, job) {
return function(data) { return function(data) {
// console.log("site = ", site);
// console.log("/api/sites/" + site.id + "/page_count = ", data);
site.page_count = data.count; site.page_count = data.count;
if (job) { if (job) {
job.page_count += data.count; job.page_count += data.count;
@ -115,8 +111,6 @@ function pageCountSuccessCallback(site, job) {
function queuedCountSuccessCallback(site, job) { function queuedCountSuccessCallback(site, job) {
return function(data) { return function(data) {
// console.log("site = ", site);
// console.log("/api/sites/" + site.id + "/queued_count = ", data);
site.queued_count = data.count; site.queued_count = data.count;
if (job) { if (job) {
job.queued_count += data.count; job.queued_count += data.count;
@ -129,41 +123,44 @@ function loadSiteStats($http, site, job) {
$http.get("/api/sites/" + site.id + "/queued_count").success(queuedCountSuccessCallback(site, job)); $http.get("/api/sites/" + site.id + "/queued_count").success(queuedCountSuccessCallback(site, job));
// look at Warcprox-Meta to find stats bucket // look at Warcprox-Meta to find stats bucket
try {
for (var j = 0; j < site.warcprox_meta.stats.buckets.length; j++) { for (var j = 0; j < site.warcprox_meta.stats.buckets.length; j++) {
var bucket = site.warcprox_meta.stats.buckets[j]; var bucket = site.warcprox_meta.stats.buckets[j];
if (typeof(bucket) == "object") { if (typeof(bucket) == "object") {
bucket = bucket["bucket"]; bucket = bucket["bucket"];
} }
if (bucket.indexOf("seed") >= 0) { if (bucket.indexOf("seed") >= 0) {
// console.log("warcprox_meta.stats.buckets[" + j + "]=" + bucket);
$http.get("/api/stats/" + bucket).success(statsSuccessCallback(site, bucket)); $http.get("/api/stats/" + bucket).success(statsSuccessCallback(site, bucket));
} }
} }
} catch (e) {
// no stats bucket for this site
}
} }
brozzlerControllers.controller("JobController", ["$scope", "$routeParams", "$http", brozzlerControllers.controller("JobController", ["$scope", "$routeParams", "$http",
function($scope, $routeParams, $http) { function($scope, $routeParams, $http) {
$scope.show_yaml = false; $scope.show_yaml = false;
// console.log('JobController');
$http.get("/api/config").success(function(data) { $http.get("/api/config").success(function(data) {
$scope.config = data.config; $scope.config = data.config;
}); });
$http.get("/api/jobs/" + $routeParams.id).success(function(data) { $http.get("/api/jobs/" + $routeParams.id).success(function(data) {
$scope.job = data; $scope.job = data;
$scope.job.page_count = $scope.job.queued_count = 0; $scope.job.page_count = $scope.job.queued_count = 0;
// console.log("job=", $scope.job); try {
var bucket = $scope.job.conf.warcprox_meta.stats.buckets[0]; var bucket = $scope.job.conf.warcprox_meta.stats.buckets[0];
if (typeof(bucket) == "object") { if (typeof(bucket) == "object") {
bucket = bucket["bucket"]; bucket = bucket["bucket"];
} }
$http.get("/api/stats/" + bucket).success(function(data) { $http.get("/api/stats/" + bucket).success(function(data) {
$scope.job.stats = data; $scope.job.stats = data;
// console.log("job stats=", $scope.job.stats);
}); });
} catch (e) {
// no stats bucket for this job
}
$http.get("/api/jobs/" + $routeParams.id + "/sites").success(function(data) { $http.get("/api/jobs/" + $routeParams.id + "/sites").success(function(data) {
$scope.sites = data.sites; $scope.sites = data.sites;
// console.log("sites=", $scope.sites);
for (var i = 0; i < $scope.sites.length; i++) { for (var i = 0; i < $scope.sites.length; i++) {
loadSiteStats($http, $scope.sites[i], $scope.job); loadSiteStats($http, $scope.sites[i], $scope.job);
} }
@ -180,7 +177,6 @@ brozzlerControllers.controller("SiteController", ["$scope", "$routeParams", "$ht
$scope.loading = false; $scope.loading = false;
$scope.pages = []; $scope.pages = [];
$window.addEventListener("scroll", function() { $window.addEventListener("scroll", function() {
// console.log("window.scrollTop=" + window.scrollTop + " window.offsetHeight=" + window.offsetHeight + " window.scrollHeight=" + window.scrollHeight);
if ($window.innerHeight + $window.scrollY + 50 >= window.document.documentElement.scrollHeight) { if ($window.innerHeight + $window.scrollY + 50 >= window.document.documentElement.scrollHeight) {
loadMorePages(); loadMorePages();
} }
@ -191,10 +187,8 @@ brozzlerControllers.controller("SiteController", ["$scope", "$routeParams", "$ht
return; return;
$scope.loading = true; $scope.loading = true;
// console.log("load more! start=" + start);
$http.get("/api/site/" + $routeParams.id + "/pages?start=" + start + "&end=" + (start+90)).then(function(response) { $http.get("/api/site/" + $routeParams.id + "/pages?start=" + start + "&end=" + (start+90)).then(function(response) {
$scope.pages = $scope.pages.concat(response.data.pages); $scope.pages = $scope.pages.concat(response.data.pages);
// console.log("pages = ", $scope.pages);
start += response.data.pages.length; start += response.data.pages.length;
$scope.loading = false; $scope.loading = false;
}, function(reason) { }, function(reason) {
@ -209,7 +203,6 @@ brozzlerControllers.controller("SiteController", ["$scope", "$routeParams", "$ht
$http.get("/api/site/" + $routeParams.id).success(function(data) { $http.get("/api/site/" + $routeParams.id).success(function(data) {
$scope.site = data; $scope.site = data;
loadSiteStats($http, $scope.site); loadSiteStats($http, $scope.site);
// console.log("site = ", $scope.site);
}); });
$http.get("/api/site/" + $routeParams.id + "/yaml").success(function(data) { $http.get("/api/site/" + $routeParams.id + "/yaml").success(function(data) {
$scope.site_yaml = data; $scope.site_yaml = data;

View file

@ -48,21 +48,14 @@ import socketserver
def _build_arg_parser(prog=os.path.basename(sys.argv[0])): def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
arg_parser = argparse.ArgumentParser( arg_parser = argparse.ArgumentParser(
prog=prog, formatter_class=argparse.ArgumentDefaultsHelpFormatter, formatter_class=brozzler.cli.BetterArgumentDefaultsHelpFormatter,
description=( prog=prog, description=(
'brozzler-easy - easy deployment of brozzler, with ' 'brozzler-easy - easy deployment of brozzler, with '
'brozzler-worker, warcprox, pywb, and brozzler-dashboard all ' 'brozzler-worker, warcprox, pywb, and brozzler-dashboard all '
'running in a single process')) 'running in a single process'))
# common args # common args
arg_parser.add_argument( brozzler.cli.add_rethinkdb_options(arg_parser)
'--rethinkdb-servers', dest='rethinkdb_servers',
default='localhost', help=(
'rethinkdb servers, e.g. '
'db0.foo.org,db0.foo.org:38015,db1.foo.org'))
arg_parser.add_argument(
'--rethinkdb-db', dest='rethinkdb_db', default='brozzler',
help='rethinkdb database name')
arg_parser.add_argument( arg_parser.add_argument(
'-d', '--warcs-dir', dest='warcs_dir', default='./warcs', '-d', '--warcs-dir', dest='warcs_dir', default='./warcs',
help='where to write warcs') help='where to write warcs')
@ -114,18 +107,7 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
type=int, default=8881, help='brozzler dashboard port') type=int, default=8881, help='brozzler dashboard port')
# common at the bottom args # common at the bottom args
arg_parser.add_argument( brozzler.cli.add_common_options(arg_parser)
'-v', '--verbose', dest='verbose', action='store_true',
help='verbose logging')
arg_parser.add_argument(
'-q', '--quiet', dest='quiet', action='store_true',
help='quiet logging (warnings and errors only)')
# arg_parser.add_argument(
# '-s', '--silent', dest='log_level', action='store_const',
# default=logging.INFO, const=logging.CRITICAL)
arg_parser.add_argument(
'--version', action='version',
version='brozzler %s - %s' % (brozzler.__version__, prog))
return arg_parser return arg_parser
@ -284,17 +266,7 @@ class BrozzlerEasyController:
def main(): def main():
arg_parser = _build_arg_parser() arg_parser = _build_arg_parser()
args = arg_parser.parse_args(args=sys.argv[1:]) args = arg_parser.parse_args(args=sys.argv[1:])
if args.verbose: brozzler.cli.configure_logging(args)
loglevel = logging.DEBUG
elif args.quiet:
loglevel = logging.WARNING
else:
loglevel = logging.INFO
logging.basicConfig(
level=loglevel, stream=sys.stderr, format=(
'%(asctime)s %(process)d %(levelname)s %(threadName)s '
'%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s'))
controller = BrozzlerEasyController(args) controller = BrozzlerEasyController(args)
signal.signal(signal.SIGTERM, lambda a,b: controller.stop.set()) signal.signal(signal.SIGTERM, lambda a,b: controller.stop.set())

View file

@ -1,7 +1,7 @@
''' '''
brozzler/site.py - classes representing sites and pages brozzler/site.py - classes representing sites and pages
Copyright (C) 2014-2016 Internet Archive Copyright (C) 2014-2017 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
@ -235,7 +235,7 @@ class Page(brozzler.BaseDictable):
self, url, id=None, site_id=None, job_id=None, hops_from_seed=0, self, url, id=None, site_id=None, job_id=None, hops_from_seed=0,
redirect_url=None, priority=None, claimed=False, brozzle_count=0, redirect_url=None, priority=None, claimed=False, brozzle_count=0,
via_page_id=None, last_claimed_by=None, hops_off_surt=0, via_page_id=None, last_claimed_by=None, hops_off_surt=0,
outlinks=None, needs_robots_check=False): outlinks=None, needs_robots_check=False, blocked_by_robots=None):
self.site_id = site_id self.site_id = site_id
self.job_id = job_id self.job_id = job_id
self.url = url self.url = url
@ -248,6 +248,7 @@ class Page(brozzler.BaseDictable):
self.hops_off_surt = hops_off_surt self.hops_off_surt = hops_off_surt
self.outlinks = outlinks self.outlinks = outlinks
self.needs_robots_check = needs_robots_check self.needs_robots_check = needs_robots_check
self.blocked_by_robots = blocked_by_robots
self._canon_hurl = None self._canon_hurl = None
if priority is not None: if priority is not None:

View file

@ -3,7 +3,7 @@ brozzler/worker.py - BrozzlerWorker brozzles pages from the frontier, meaning
it runs youtube-dl on them, browses them and runs behaviors if appropriate, it runs youtube-dl on them, browses them and runs behaviors if appropriate,
scopes and adds outlinks to the frontier scopes and adds outlinks to the frontier
Copyright (C) 2014-2016 Internet Archive Copyright (C) 2014-2017 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
@ -337,6 +337,7 @@ class BrozzlerWorker:
if (page.needs_robots_check and if (page.needs_robots_check and
not brozzler.is_permitted_by_robots(site, page.url)): not brozzler.is_permitted_by_robots(site, page.url)):
logging.warn("page %s is blocked by robots.txt", page.url) logging.warn("page %s is blocked by robots.txt", page.url)
page.blocked_by_robots = True
else: else:
outlinks = self.brozzle_page(browser, site, page) outlinks = self.brozzle_page(browser, site, page)
self._frontier.scope_and_schedule_outlinks( self._frontier.scope_and_schedule_outlinks(

View file

@ -2,7 +2,7 @@
''' '''
setup.py - brozzler setup script setup.py - brozzler setup script
Copyright (C) 2014-2016 Internet Archive Copyright (C) 2014-2017 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup( setuptools.setup(
name='brozzler', name='brozzler',
version='1.1b9.dev165', version='1.1b9.dev176',
description='Distributed web crawling with browsers', description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler', url='https://github.com/internetarchive/brozzler',
author='Noah Levitt', author='Noah Levitt',

View file

@ -0,0 +1,13 @@
<html>
<head>
<title>a page that pops up an alert</title>
<script>
alert("I'm an alert")
</script>
</head>
<body>
<h1>alert</h1>
<p>this is a page that pops up an alert</p>
</body>
</html>

View file

@ -0,0 +1,13 @@
<html>
<head>
<title>a page that pops up a confirm dialog</title>
<script>
confirm("I'm a confirm dialog")
</script>
</head>
<body>
<h1>confirm</h1>
<p>this is a page that pops up a confirm modal dialog</p>
</body>
</html>

View file

@ -0,0 +1,13 @@
<html>
<head>
<title>a page that pops up a print dialog</title>
<script>
print()
</script>
</head>
<body>
<h1>print</h1>
<p>this is a page that pops up a print dialog</p>
</body>
</html>

View file

@ -0,0 +1,13 @@
<html>
<head>
<title>a page that pops up a prompt</title>
<script>
prompt("I'm a prompt")
</script>
</head>
<body>
<h1>prompt</h1>
<p>this is a page that pops up a prompt</p>
</body>
</html>

View file

@ -29,7 +29,7 @@ import json
args = argparse.Namespace() args = argparse.Namespace()
args.log_level = logging.INFO args.log_level = logging.INFO
brozzler.cli._configure_logging(args) brozzler.cli.configure_logging(args)
WARCPROX_META_420 = { WARCPROX_META_420 = {
'stats': { 'stats': {
@ -114,7 +114,6 @@ def test_on_response(httpd):
url = 'http://localhost:%s/site3/page.html' % httpd.server_port url = 'http://localhost:%s/site3/page.html' % httpd.server_port
with brozzler.Browser(chrome_exe=chrome_exe) as browser: with brozzler.Browser(chrome_exe=chrome_exe) as browser:
browser.browse_page(url, on_response=on_response) browser.browse_page(url, on_response=on_response)
browser.browse_page(url)
assert response_urls[0] == 'http://localhost:%s/site3/page.html' % httpd.server_port assert response_urls[0] == 'http://localhost:%s/site3/page.html' % httpd.server_port
assert response_urls[1] == 'http://localhost:%s/site3/brozzler.svg' % httpd.server_port assert response_urls[1] == 'http://localhost:%s/site3/brozzler.svg' % httpd.server_port
assert response_urls[2] == 'http://localhost:%s/favicon.ico' % httpd.server_port assert response_urls[2] == 'http://localhost:%s/favicon.ico' % httpd.server_port
@ -126,3 +125,20 @@ def test_420(httpd):
with pytest.raises(brozzler.ReachedLimit) as excinfo: with pytest.raises(brozzler.ReachedLimit) as excinfo:
browser.browse_page(url) browser.browse_page(url)
assert excinfo.value.warcprox_meta == WARCPROX_META_420 assert excinfo.value.warcprox_meta == WARCPROX_META_420
def test_js_dialogs(httpd):
    '''
    Browse pages that open javascript dialogs on load and check that each
    `browse_page` call returns instead of hanging.

    `httpd` supplies `server_port`, the port of the local http server that
    serves the `/site4/` pages. The test passes if no call raises.
    '''
    chrome_exe = brozzler.suggest_default_chrome_exe()
    # XXX print dialog unresolved, so 'print.html' is left out for now
    dialog_pages = ('alert.html', 'confirm.html', 'prompt.html')
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        # before commit d2ed6b97a24 these would hang and eventually raise
        # brozzler.browser.BrowsingTimeout, which would cause this test to fail
        for page in dialog_pages:
            browser.browse_page(
                    'http://localhost:%s/site4/%s' % (httpd.server_port, page))

View file

@ -3,7 +3,7 @@
test_cluster.py - integration tests for a brozzler cluster, expects brozzler, test_cluster.py - integration tests for a brozzler cluster, expects brozzler,
warcprox, pywb, rethinkdb and other dependencies to be running already warcprox, pywb, rethinkdb and other dependencies to be running already
Copyright (C) 2016 Internet Archive Copyright (C) 2016-2017 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
@ -257,8 +257,9 @@ def test_obey_robots(httpd):
# check that only the one page is in rethinkdb # check that only the one page is in rethinkdb
pages = list(frontier.site_pages(site.id)) pages = list(frontier.site_pages(site.id))
assert len(pages) == 1 assert len(pages) == 1
assert {page.url for page in pages} == { page = pages[0]
'http://localhost:%s/site1/' % httpd.server_port} assert page.url == 'http://localhost:%s/site1/' % httpd.server_port
assert page.blocked_by_robots
# take a look at the captures table # take a look at the captures table
time.sleep(2) # in case warcprox hasn't finished processing urls time.sleep(2) # in case warcprox hasn't finished processing urls