Mirror of https://github.com/internetarchive/brozzler.git
Synced 2025-04-21 08:06:27 -04:00

commit 8c116295ea

Merge branch 'master' into qa

* master:
  restore ping_timeout argument to WebSocketApp.run_forever to fix problem of leaking websocket receiver threads hanging forever on select()
  missed a spot
  improve brozzler-dashboard logging; fix default wayback baseurl in brozzler dashboard (https://github.com/internetarchive/brozzler/issues/31); tweak arg parsing related stuff
  avoid js errors in case site or job is not configured to keep stats
  add travis-ci slack notification to internetarchive/brozzler channel
.travis.yml
@@ -17,3 +17,5 @@ after_failure:
 notifications:
   slack:
     secure: KPPXSscXnmSEQ2NXBZFKrzDEYHg067Kv1WR7RTRUH8EIlSS9MHTyErRa7HkaRPmqOllj4vvPbplNU2ALnCfhP4cqW+MvF0xv3GuEGXQ7Om2sBvVUQ3w0JJ5rLq9ferAfGdSnQFeViqfDix5LA3fMNZGouUHQdUHq7iO8E9n9jntvkKO9Jff7Dyo0K5KvOZOJfM9KsqFZLlFO5zoNB6Y9jubIT7+Ulk3EDto/Kny34VPIyJIm7y0cHHlYLEq780AweY0EIwMyMg/VPSRrVAsbLSrilO0YRgsQpjPC9Ci/rAWNWooaOk0eA+bwv1uHQnGtH0z446XUMXr3UZ2QlD4DE/uoP2okkl8EtqvlmEyjV8eO86TqYFDRgKfYpvlK6hHtb7SAHX28QeXQjbKNc5f7KpKO5PtZqaoBRL7acLlKyS8xQGiRtonTPFSBTFR2A+s6dZmKO9dDboglptiHk4dvL1ZD4S8qLJn1JjTJqvIU6tpCY3BpNErn4n1MkDjN5nqdXf7Q9Vmui8vRetwnMf1oXcsKj9FEt2utNfDqFNXcFsN+Mnr9rhXQ1++gt/7Zo844OowiARcxqZTNy5LqSD01WgGCvNMy3Odf+FTQ8PcDOF+001+g8La1R99U0o9/hT/gy+WYk2prYneWru4pQHF/a6goZgkLTwkskcaPVpDJtDs=
     secure: jopAXO8j3AkNWhF02GIzlkHJmqcCfrDEDPHcLHwxGB1vKrJqfMtcmV1+JXv7jGPwT8hBkkZItD1fTbsA1UMTtZCsadhqwrH9sh/BtJy4mf1jDDK0Hq4bPdbpB/mHKBfjD+ZedPZphCiwRQm94QdMOAsmCsj1BluFn+ySHuNAnwyXCNohut5a3aFBszOwBNgZMwBmu+weAUpMrDbr/dhqOtU0IaNvhTJ2Ykyex7Of86L05lBI8MiGtq/J73uDiDINWViBXqG5+/LKIVLvnjzCxZOnOVtSVorRNY0OsClfLJILuWOXk0/C3p+lBCyq5iatWweNqcqqpMifUSdVp4x8GnPyvl4O5YuIZW674mpGmH6UW10MqEnqxFQIcZpArir/zToK/cIKsUse20n8U5LUgOSWeNM1RIBvc4ckeDuthjwvyfmP0hrnNxrPFxRez2J2r6alWFABvD0H83a3hn56AtGXqV+9gt9d4J0+vnBJkXMidQaORBnyRkPlTROxqkoK8r0PME8xr6GwDWHpUN7/Ibo9gS/zpA7zpJUIsAsevVKOSaITZwKqbCMTI3uy/tJcnzRUrnq5wqhh8vXlWzIxEvTW8vuIapjSvDzhnJga85bIEmoauyMd13gR/vhqXQ3xUdN5LeyXAPn24b5e2GNSrhDOaAs30tXe+Z31njSeKPM=
brozzler/browser.py
@@ -157,7 +157,10 @@ class WebsockReceiverThread(threading.Thread):
         brozzler.thread_raise(self.calling_thread, BrowsingException)

     def run(self):
-        self.websock.run_forever()
+        # ping_timeout is used as the timeout for the call to select.select()
+        # in addition to its documented purpose, and must have a value to avoid
+        # hangs in certain situations
+        self.websock.run_forever(ping_timeout=0.5)

     def _on_message(self, websock, message):
         try:
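For reference, a minimal sketch of how websocket-client's run_forever loop is typically driven with a ping_timeout. The URL and handler below are illustrative placeholders, not brozzler's real devtools wiring; the select() behavior is the one the comment above documents for the websocket-client versions brozzler targeted at the time.

    import websocket  # websocket-client package

    def on_message(ws, message):
        print("got message:", message)

    # hypothetical endpoint; brozzler gets its real URL from Chrome's devtools
    ws = websocket.WebSocketApp(
            'ws://localhost:9222/devtools/page/1', on_message=on_message)

    # passing ping_timeout bounds the internal select() call, so the receiver
    # thread wakes up periodically instead of blocking forever on a dead or
    # torn-down connection
    ws.run_forever(ping_timeout=0.5)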
brozzler/cli.py (108 changed lines)
@@ -2,7 +2,7 @@
 '''
 brozzler/cli.py - brozzler command line executables

-Copyright (C) 2014-2016 Internet Archive
+Copyright (C) 2014-2017 Internet Archive

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -38,16 +38,19 @@ import yaml
 import shutil
 import base64

-def _add_common_options(arg_parser):
+def add_common_options(arg_parser):
     arg_parser.add_argument(
-            '-q', '--quiet', dest='log_level',
-            action='store_const', default=logging.INFO, const=logging.WARN)
+            '-q', '--quiet', dest='log_level', action='store_const',
+            default=logging.INFO, const=logging.WARN, help=(
+                'quiet logging, only warnings and errors'))
     arg_parser.add_argument(
-            '-v', '--verbose', dest='log_level',
-            action='store_const', default=logging.INFO, const=logging.DEBUG)
+            '-v', '--verbose', dest='log_level', action='store_const',
+            default=logging.INFO, const=logging.DEBUG, help=(
+                'verbose logging'))
     arg_parser.add_argument(
-            '--trace', dest='log_level',
-            action='store_const', default=logging.INFO, const=brozzler.TRACE)
+            '--trace', dest='log_level', action='store_const',
+            default=logging.INFO, const=brozzler.TRACE, help=(
+                'very verbose logging'))
     # arg_parser.add_argument(
     #         '-s', '--silent', dest='log_level', action='store_const',
     #         default=logging.INFO, const=logging.CRITICAL)
@@ -56,20 +59,23 @@ def _add_common_options(arg_parser):
             version='brozzler %s - %s' % (
                 brozzler.__version__, os.path.basename(sys.argv[0])))

-def _add_rethinkdb_options(arg_parser):
+def add_rethinkdb_options(arg_parser):
     arg_parser.add_argument(
-            '--rethinkdb-servers', dest='rethinkdb_servers', help=(
+            '--rethinkdb-servers', dest='rethinkdb_servers',
+            default=os.environ.get('BROZZLER_RETHINKDB_SERVERS', 'localhost'),
+            help=(
                 'rethinkdb servers, e.g. '
-                'db0.foo.org,db0.foo.org:38015,db1.foo.org (takes precedence '
-                'over environment variable BROZZLER_RETHINKDB_SERVERS)'))
+                'db0.foo.org,db0.foo.org:38015,db1.foo.org (default is the '
+                'value of environment variable BROZZLER_RETHINKDB_SERVERS)'))
     arg_parser.add_argument(
-            '--rethinkdb-db', dest='rethinkdb_db', help=(
-                'rethinkdb database name (takes precedence over '
-                'environment variable BROZZLER_RETHINKDB_DB)'))
+            '--rethinkdb-db', dest='rethinkdb_db',
+            default=os.environ.get('BROZZLER_RETHINKDB_DB', 'brozzler'),
+            help=(
+                'rethinkdb database name (default is the value of environment '
+                'variable BROZZLER_RETHINKDB_DB)'))

 def rethinker(args):
-    servers = args.rethinkdb_servers or os.environ.get(
-            'BROZZLER_RETHINKDB_SERVERS') or 'localhost'
-    db = args.rethinkdb_db or os.environ.get(
-            'BROZZLER_RETHINKDB_DB') or 'brozzler'
+    servers = args.rethinkdb_servers or 'localhost'
+    db = args.rethinkdb_db or 'brozzler'
     return rethinkstuff.Rethinker(servers.split(','), db)
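With the defaults moved into argparse, the effective precedence for these settings becomes: explicit command-line flag, then environment variable, then the hard-coded fallback. A small sketch of that resolution order; the stand-alone parser is illustrative, while the option and variable names are the ones from the diff.

    import argparse
    import os

    arg_parser = argparse.ArgumentParser()
    # os.environ.get() is evaluated when the parser is built, so the env var
    # must already be set at that point
    arg_parser.add_argument(
            '--rethinkdb-servers', dest='rethinkdb_servers',
            default=os.environ.get('BROZZLER_RETHINKDB_SERVERS', 'localhost'))

    # no flag and no env var                          -> 'localhost'
    # BROZZLER_RETHINKDB_SERVERS=db0,db1 and no flag  -> 'db0,db1'
    # --rethinkdb-servers db2                         -> 'db2' (flag wins)
    args = arg_parser.parse_args([])
    print(args.rethinkdb_servers)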
@@ -83,7 +89,7 @@ def _add_proxy_options(arg_parser):
                 'enable special features that assume the configured proxy is '
                 'warcprox'))

-def _configure_logging(args):
+def configure_logging(args):
     logging.basicConfig(
             stream=sys.stderr, level=args.log_level,
             format=(
@@ -115,6 +121,18 @@ def suggest_default_chrome_exe():
             return exe
     return 'chromium-browser'

+class BetterArgumentDefaultsHelpFormatter(
+                argparse.ArgumentDefaultsHelpFormatter):
+    '''
+    Like argparse.ArgumentDefaultsHelpFormatter but omits the default value
+    for arguments with action='store_const'.
+    '''
+    def _get_help_string(self, action):
+        if isinstance(action, argparse._StoreConstAction):
+            return action.help
+        else:
+            return super()._get_help_string(action)
+
 def brozzle_page():
     '''
     Command line utility entry point for brozzling a single page. Opens url in
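A quick illustration of what the new formatter changes. The parser and options below are hypothetical, built to mirror the store_const flags added above, and assume brozzler.cli is importable.

    import argparse
    import logging
    from brozzler.cli import BetterArgumentDefaultsHelpFormatter

    arg_parser = argparse.ArgumentParser(
            prog='example', formatter_class=BetterArgumentDefaultsHelpFormatter)
    arg_parser.add_argument(
            '-q', '--quiet', dest='log_level', action='store_const',
            default=logging.INFO, const=logging.WARN,
            help='quiet logging, only warnings and errors')
    arg_parser.add_argument(
            '--retries', type=int, default=3, help='how many times to retry')

    # --retries gets '(default: 3)' appended to its help text, but -q/--quiet
    # does not get a confusing '(default: 20)' appended, because its action is
    # store_const and the formatter leaves such help strings alone
    arg_parser.print_help()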
@@ -123,7 +141,7 @@ def brozzle_page():
     arg_parser = argparse.ArgumentParser(
             prog=os.path.basename(sys.argv[0]),
             description='brozzle-page - brozzle a single page',
-            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+            formatter_class=BetterArgumentDefaultsHelpFormatter)
     arg_parser.add_argument('url', metavar='URL', help='page url')
     arg_parser.add_argument(
             '-e', '--chrome-exe', dest='chrome_exe',
@@ -149,10 +167,10 @@ def brozzle_page():
             action='store_true', help=(
                 'enable special features that assume the configured proxy '
                 'is warcprox'))
-    _add_common_options(arg_parser)
+    add_common_options(arg_parser)

     args = arg_parser.parse_args(args=sys.argv[1:])
-    _configure_logging(args)
+    configure_logging(args)

     behavior_parameters = {}
     if args.behavior_parameters:
@@ -199,11 +217,11 @@ def brozzler_new_job():
     arg_parser.add_argument(
             'job_conf_file', metavar='JOB_CONF_FILE',
             help='brozzler job configuration file in yaml')
-    _add_rethinkdb_options(arg_parser)
-    _add_common_options(arg_parser)
+    add_rethinkdb_options(arg_parser)
+    add_common_options(arg_parser)

     args = arg_parser.parse_args(args=sys.argv[1:])
-    _configure_logging(args)
+    configure_logging(args)

     r = rethinker(args)
     frontier = brozzler.RethinkDbFrontier(r)
@@ -225,7 +243,7 @@ def brozzler_new_site():
             description='brozzler-new-site - register site to brozzle',
             formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     arg_parser.add_argument('seed', metavar='SEED', help='seed url')
-    _add_rethinkdb_options(arg_parser)
+    add_rethinkdb_options(arg_parser)
     _add_proxy_options(arg_parser)
     arg_parser.add_argument(
             '--time-limit', dest='time_limit', default=None,
@@ -251,10 +269,10 @@ def brozzler_new_site():
     arg_parser.add_argument(
             '--password', dest='password', default=None,
             help='use this password to try to log in if a login form is found')
-    _add_common_options(arg_parser)
+    add_common_options(arg_parser)

     args = arg_parser.parse_args(args=sys.argv[1:])
-    _configure_logging(args)
+    configure_logging(args)

     site = brozzler.Site(
             seed=args.seed, proxy=args.proxy,
@@ -279,7 +297,7 @@ def brozzler_worker():
     arg_parser = argparse.ArgumentParser(
             prog=os.path.basename(__file__),
             formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    _add_rethinkdb_options(arg_parser)
+    add_rethinkdb_options(arg_parser)
     arg_parser.add_argument(
             '-e', '--chrome-exe', dest='chrome_exe',
             default=suggest_default_chrome_exe(),
@@ -287,10 +305,10 @@ def brozzler_worker():
     arg_parser.add_argument(
             '-n', '--max-browsers', dest='max_browsers', default='1',
             help='max number of chrome instances simultaneously browsing pages')
-    _add_common_options(arg_parser)
+    add_common_options(arg_parser)

     args = arg_parser.parse_args(args=sys.argv[1:])
-    _configure_logging(args)
+    configure_logging(args)

     def sigterm(signum, frame):
         raise brozzler.ShutdownRequested('shutdown requested (caught SIGTERM)')
@@ -344,11 +362,11 @@ def brozzler_ensure_tables():
     arg_parser = argparse.ArgumentParser(
             prog=os.path.basename(sys.argv[0]),
             formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-    _add_rethinkdb_options(arg_parser)
-    _add_common_options(arg_parser)
+    add_rethinkdb_options(arg_parser)
+    add_common_options(arg_parser)

     args = arg_parser.parse_args(args=sys.argv[1:])
-    _configure_logging(args)
+    configure_logging(args)

     r = rethinker(args)

@@ -374,11 +392,11 @@ def brozzler_list_jobs():
     arg_parser.add_argument(
             '-a', '--all', dest='all', action='store_true', help=(
                 'list all jobs (by default, only active jobs are listed)'))
-    _add_rethinkdb_options(arg_parser)
-    _add_common_options(arg_parser)
+    add_rethinkdb_options(arg_parser)
+    add_common_options(arg_parser)

     args = arg_parser.parse_args(args=sys.argv[1:])
-    _configure_logging(args)
+    configure_logging(args)

     r = rethinker(args)
     reql = r.table('jobs').order_by('id')
@@ -403,11 +421,11 @@ def brozzler_list_sites():
     group.add_argument(
             '--job', dest='job', metavar='JOB_ID', help=(
                 'list only sites for the supplied job'))
-    _add_rethinkdb_options(arg_parser)
-    _add_common_options(arg_parser)
+    add_rethinkdb_options(arg_parser)
+    add_common_options(arg_parser)

     args = arg_parser.parse_args(args=sys.argv[1:])
-    _configure_logging(args)
+    configure_logging(args)

     r = rethinker(args)

@@ -449,11 +467,11 @@ def brozzler_list_pages():
             '--claimed', dest='claimed', action='store_true', help=(
                 'limit only pages that are currently claimed by a brozzler '
                 'worker'))
-    _add_rethinkdb_options(arg_parser)
-    _add_common_options(arg_parser)
+    add_rethinkdb_options(arg_parser)
+    add_common_options(arg_parser)

     args = arg_parser.parse_args(args=sys.argv[1:])
-    _configure_logging(args)
+    configure_logging(args)

     r = rethinker(args)
     if args.job:
@@ -508,14 +526,14 @@ def brozzler_list_captures():
                 'use prefix match for url (n.b. may not work as expected if '
                 'searching key has query string because canonicalization can '
                 'reorder query parameters)'))
-    _add_rethinkdb_options(arg_parser)
-    _add_common_options(arg_parser)
+    add_rethinkdb_options(arg_parser)
+    add_common_options(arg_parser)
     arg_parser.add_argument(
             'url_or_sha1', metavar='URL_or_SHA1',
             help='url or sha1 to look up in captures table')

     args = arg_parser.parse_args(args=sys.argv[1:])
-    _configure_logging(args)
+    configure_logging(args)

     r = rethinker(args)

brozzler/dashboard/__init__.py
@@ -2,7 +2,7 @@
 '''
 brozzler/dashboard/__init__.py - flask app for brozzler dashboard, defines api
 endspoints etc

-Copyright (C) 2014-2016 Internet Archive
+Copyright (C) 2014-2017 Internet Archive

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -35,27 +35,15 @@ import rethinkdb
 import yaml
 import base64

-# flask does its own logging config
-# logging.basicConfig(
-#         stream=sys.stdout, level=logging.INFO,
-#         format=(
-#             "%(asctime)s %(process)d %(levelname)s %(threadName)s "
-#             "%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s")
-
 app = flask.Flask(__name__)

-# http://stackoverflow.com/questions/26578733/why-is-flask-application-not-creating-any-logs-when-hosted-by-gunicorn
-gunicorn_error_logger = logging.getLogger('gunicorn.error')
-app.logger.handlers.extend(gunicorn_error_logger.handlers)
-app.logger.setLevel(logging.INFO)
-
 # configure with environment variables
 SETTINGS = {
     'RETHINKDB_SERVERS': os.environ.get(
         'BROZZLER_RETHINKDB_SERVERS', 'localhost').split(','),
     'RETHINKDB_DB': os.environ.get('BROZZLER_RETHINKDB_DB', 'brozzler'),
     'WAYBACK_BASEURL': os.environ.get(
-        'WAYBACK_BASEURL', 'http://localhost:8091/brozzler'),
+        'WAYBACK_BASEURL', 'http://localhost:8880/brozzler'),
 }
 r = rethinkstuff.Rethinker(
         SETTINGS['RETHINKDB_SERVERS'], db=SETTINGS['RETHINKDB_DB'])
@@ -69,20 +57,24 @@ def service_registry():
 @app.route("/api/sites/<site_id>/queued_count")
 @app.route("/api/site/<site_id>/queued_count")
 def queued_count(site_id):
-    count = r.table("pages").between(
+    reql = r.table("pages").between(
             [site_id, 0, False, r.minval], [site_id, 0, False, r.maxval],
-            index="priority_by_site").count().run()
+            index="priority_by_site").count()
+    logging.debug("querying rethinkdb: %s", reql)
+    count = reql.run()
     return flask.jsonify(count=count)

 @app.route("/api/sites/<site_id>/queue")
 @app.route("/api/site/<site_id>/queue")
 def queue(site_id):
-    app.logger.info("flask.request.args=%s", flask.request.args)
+    logging.debug("flask.request.args=%s", flask.request.args)
     start = flask.request.args.get("start", 0)
     end = flask.request.args.get("end", start + 90)
-    queue_ = r.table("pages").between(
+    reql = r.table("pages").between(
             [site_id, 0, False, r.minval], [site_id, 0, False, r.maxval],
-            index="priority_by_site")[start:end].run()
+            index="priority_by_site")[start:end]
+    logging.debug("querying rethinkdb: %s", reql)
+    queue_ = reql.run()
     return flask.jsonify(queue_=list(queue_))

 @app.route("/api/sites/<site_id>/pages_count")
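The recurring refactor in these dashboard handlers: build the ReQL query object first, log its string form, then call run(). A minimal sketch of the pattern with the plain rethinkdb driver as it was used at the time; connection handling here is illustrative, since the dashboard itself goes through rethinkstuff.Rethinker, which manages connections and lets it call reql.run() with no argument.

    import logging
    import rethinkdb as r

    def queued_count(conn, site_id):
        # building the query without running it lets us log the full ReQL
        # expression before it hits the database
        reql = r.table("pages").between(
                [site_id, 0, False, r.minval], [site_id, 0, False, r.maxval],
                index="priority_by_site").count()
        logging.debug("querying rethinkdb: %s", reql)
        return reql.run(conn)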
@@ -90,42 +82,51 @@ def queue(site_id):

 @app.route("/api/sites/<site_id>/page_count")
 @app.route("/api/site/<site_id>/page_count")
 def page_count(site_id):
-    count = r.table("pages").between(
+    reql = r.table("pages").between(
             [site_id, 1, False, r.minval],
             [site_id, r.maxval, False, r.maxval],
-            index="priority_by_site").count().run()
+            index="priority_by_site").count()
+    logging.debug("querying rethinkdb: %s", reql)
+    count = reql.run()
     return flask.jsonify(count=count)

 @app.route("/api/sites/<site_id>/pages")
 @app.route("/api/site/<site_id>/pages")
 def pages(site_id):
     """Pages already crawled."""
     app.logger.info("flask.request.args=%s", flask.request.args)
     start = int(flask.request.args.get("start", 0))
     end = int(flask.request.args.get("end", start + 90))
-    pages_ = r.table("pages").between(
+    reql = r.table("pages").between(
             [site_id, 1, r.minval], [site_id, r.maxval, r.maxval],
-            index="least_hops").order_by(index="least_hops")[start:end].run()
+            index="least_hops").order_by(index="least_hops")[start:end]
+    logging.debug("querying rethinkdb: %s", reql)
+    pages_ = reql.run()
     return flask.jsonify(pages=list(pages_))

 @app.route("/api/pages/<page_id>")
 @app.route("/api/page/<page_id>")
 def page(page_id):
-    page_ = r.table("pages").get(page_id).run()
+    reql = r.table("pages").get(page_id)
+    logging.debug("querying rethinkdb: %s", reql)
+    page_ = reql.run()
     return flask.jsonify(page_)

 @app.route("/api/pages/<page_id>/yaml")
 @app.route("/api/page/<page_id>/yaml")
 def page_yaml(page_id):
-    page_ = r.table("pages").get(page_id).run()
+    reql = r.table("pages").get(page_id)
+    logging.debug("querying rethinkdb: %s", reql)
+    page_ = reql.run()
     return app.response_class(
             yaml.dump(page_, default_flow_style=False),
-            mimetype='application/yaml')
+            mimetype="application/yaml")

 @app.route("/api/sites/<site_id>")
 @app.route("/api/site/<site_id>")
 def site(site_id):
-    s = r.table("sites").get(site_id).run()
+    reql = r.table("sites").get(site_id)
+    logging.debug("querying rethinkdb: %s", reql)
+    s = reql.run()
     if "cookie_db" in s:
         s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii")
     return flask.jsonify(s)
@@ -133,20 +134,30 @@ def site(site_id):

 @app.route("/api/sites/<site_id>/yaml")
 @app.route("/api/site/<site_id>/yaml")
 def site_yaml(site_id):
-    site_ = r.table("sites").get(site_id).run()
+    reql = r.table("sites").get(site_id)
+    logging.debug("querying rethinkdb: %s", reql)
+    site_ = reql.run()
     return app.response_class(
             yaml.dump(site_, default_flow_style=False),
-            mimetype='application/yaml')
+            mimetype="application/yaml")

 @app.route("/api/stats/<bucket>")
 def stats(bucket):
-    stats_ = r.table("stats").get(bucket).run()
+    reql = r.table("stats").get(bucket)
+    logging.debug("querying rethinkdb: %s", reql)
+    stats_ = reql.run()
     return flask.jsonify(stats_)

-@app.route("/api/jobs/<int:job_id>/sites")
-@app.route("/api/job/<int:job_id>/sites")
+@app.route("/api/jobs/<job_id>/sites")
+@app.route("/api/job/<job_id>/sites")
 def sites(job_id):
-    sites_ = list(r.table("sites").get_all(job_id, index="job_id").run())
+    try:
+        jid = int(job_id)
+    except ValueError:
+        jid = job_id
+    reql = r.table("sites").get_all(jid, index="job_id")
+    logging.debug("querying rethinkdb: %s", reql)
+    sites_ = list(reql.run())
     # TypeError: <binary, 7168 bytes, '53 51 4c 69 74 65...'> is not JSON serializable
     for s in sites_:
         if "cookie_db" in s:
@@ -156,26 +167,40 @@ def sites(job_id):

 @app.route("/api/jobless-sites")
 def jobless_sites():
     # XXX inefficient (unindexed) query
-    sites_ = list(r.table("sites").filter(~r.row.has_fields("job_id")).run())
+    reql = r.table("sites").filter(~r.row.has_fields("job_id"))
+    logging.debug("querying rethinkdb: %s", reql)
+    sites_ = list(reql.run())
     # TypeError: <binary, 7168 bytes, '53 51 4c 69 74 65...'> is not JSON serializable
     for s in sites_:
         if "cookie_db" in s:
             s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii")
     return flask.jsonify(sites=sites_)

-@app.route("/api/jobs/<int:job_id>")
-@app.route("/api/job/<int:job_id>")
+@app.route("/api/jobs/<job_id>")
+@app.route("/api/job/<job_id>")
 def job(job_id):
-    job_ = r.table("jobs").get(job_id).run()
+    try:
+        jid = int(job_id)
+    except ValueError:
+        jid = job_id
+    reql = r.table("jobs").get(jid)
+    logging.debug("querying rethinkdb: %s", reql)
+    job_ = reql.run()
     return flask.jsonify(job_)

-@app.route("/api/jobs/<int:job_id>/yaml")
-@app.route("/api/job/<int:job_id>/yaml")
+@app.route("/api/jobs/<job_id>/yaml")
+@app.route("/api/job/<job_id>/yaml")
 def job_yaml(job_id):
-    job_ = r.table("jobs").get(job_id).run()
+    try:
+        jid = int(job_id)
+    except ValueError:
+        jid = job_id
+    reql = r.table("jobs").get(jid)
+    logging.debug("querying rethinkdb: %s", reql)
+    job_ = reql.run()
     return app.response_class(
             yaml.dump(job_, default_flow_style=False),
-            mimetype='application/yaml')
+            mimetype="application/yaml")

 @app.route("/api/workers")
 def workers():
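The routes above drop the <int:job_id> converter (so non-numeric ids also match the route) and coerce the id manually. The same coercion appears inline three times; pulled out here as a hypothetical helper purely to make the logic explicit:

    def coerce_job_id(job_id):
        # jobs created by brozzler-new-job get integer ids, but the routes now
        # also accept other id values, which are passed through unchanged
        try:
            return int(job_id)
        except ValueError:
            return job_id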
@@ -189,7 +214,9 @@ def services():

 @app.route("/api/jobs")
 def jobs():
-    jobs_ = list(r.table("jobs").order_by(rethinkdb.desc("id")).run())
+    reql = r.table("jobs").order_by(rethinkdb.desc("id"))
+    logging.debug("querying rethinkdb: %s", reql)
+    jobs_ = list(reql.run())
     return flask.jsonify(jobs=jobs_)

 @app.route("/api/config")
@@ -209,6 +236,12 @@ def root(path):
 try:
     import gunicorn.app.base
     from gunicorn.six import iteritems
+    import gunicorn.glogging
+
+    class BypassGunicornLogging(gunicorn.glogging.Logger):
+        def setup(self, cfg):
+            self.error_log.handlers = logging.root.handlers
+            self.access_log.handlers = logging.root.handlers

     class GunicornBrozzlerDashboard(gunicorn.app.base.BaseApplication):
         def __init__(self, app, options=None):
@@ -222,21 +255,24 @@ try:
                         if key in self.cfg.settings and value is not None])
             for key, value in iteritems(config):
                 self.cfg.set(key.lower(), value)
+            self.cfg.set("logger_class", BypassGunicornLogging)
+            self.cfg.set("accesslog", "dummy-value")

         def load(self):
             return self.application

     def run(**options):
-        logging.info('running brozzler-dashboard using gunicorn')
+        logging.info("running brozzler-dashboard using gunicorn")
         GunicornBrozzlerDashboard(app, options).run()

 except ImportError:
     def run():
-        logging.info('running brozzler-dashboard using simple flask app.run')
+        logging.info("running brozzler-dashboard using simple flask app.run")
         app.run()

 def main():
     import argparse
     import brozzler.cli
     arg_parser = argparse.ArgumentParser(
             prog=os.path.basename(sys.argv[0]),
             formatter_class=argparse.RawDescriptionHelpFormatter,
@@ -252,8 +288,10 @@ def main():
                 ' BROZZLER_RETHINKDB_DB rethinkdb database name '
                 '(default: brozzler)\n'
                 ' WAYBACK_BASEURL base url for constructing wayback '
-                'links (default http://localhost:8091/brozzler)'))
+                'links (default http://localhost:8880/brozzler)'))
+    brozzler.cli.add_common_options(arg_parser)
     args = arg_parser.parse_args(args=sys.argv[1:])
+    brozzler.cli.configure_logging(args)
     run()

 if __name__ == "__main__":
brozzler/dashboard/static/js/app.js
@@ -1,7 +1,7 @@
 /*
  * brozzler/dashboard/static/js/app.js - brozzler dashboard angularjs code
  *
- * Copyright (C) 2014-2016 Internet Archive
+ * Copyright (C) 2014-2017 Internet Archive
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -96,16 +96,12 @@ brozzlerControllers.controller("WorkersListController", ["$scope", "$http",

 function statsSuccessCallback(site, bucket) {
     return function(data) {
-        // console.log("site = ", site);
-        // console.log("/api/stats/" + bucket + " = ", data);
         site.stats = data;
     }
 }

 function pageCountSuccessCallback(site, job) {
     return function(data) {
-        // console.log("site = ", site);
-        // console.log("/api/sites/" + site.id + "/page_count = ", data);
         site.page_count = data.count;
         if (job) {
             job.page_count += data.count;
@@ -115,8 +111,6 @@ function pageCountSuccessCallback(site, job) {

 function queuedCountSuccessCallback(site, job) {
     return function(data) {
-        // console.log("site = ", site);
-        // console.log("/api/sites/" + site.id + "/queued_count = ", data);
         site.queued_count = data.count;
         if (job) {
             job.queued_count += data.count;
@@ -129,41 +123,44 @@ function loadSiteStats($http, site, job) {
     $http.get("/api/sites/" + site.id + "/queued_count").success(queuedCountSuccessCallback(site, job));

     // look at Warcprox-Meta to find stats bucket
-    for (var j = 0; j < site.warcprox_meta.stats.buckets.length; j++) {
-        var bucket = site.warcprox_meta.stats.buckets[j];
-        if (typeof(bucket) == "object") {
-            bucket = bucket["bucket"];
-        }
-        if (bucket.indexOf("seed") >= 0) {
-            // console.log("warcprox_meta.stats.buckets[" + j + "]=" + bucket);
-            $http.get("/api/stats/" + bucket).success(statsSuccessCallback(site, bucket));
-        }
-    }
+    try {
+        for (var j = 0; j < site.warcprox_meta.stats.buckets.length; j++) {
+            var bucket = site.warcprox_meta.stats.buckets[j];
+            if (typeof(bucket) == "object") {
+                bucket = bucket["bucket"];
+            }
+            if (bucket.indexOf("seed") >= 0) {
+                $http.get("/api/stats/" + bucket).success(statsSuccessCallback(site, bucket));
+            }
+        }
+    } catch (e) {
+        // no stats bucket for this site
+    }
 }

 brozzlerControllers.controller("JobController", ["$scope", "$routeParams", "$http",
     function($scope, $routeParams, $http) {
         $scope.show_yaml = false;
-        // console.log('JobController');
         $http.get("/api/config").success(function(data) {
             $scope.config = data.config;
         });
         $http.get("/api/jobs/" + $routeParams.id).success(function(data) {
             $scope.job = data;
             $scope.job.page_count = $scope.job.queued_count = 0;
-            // console.log("job=", $scope.job);
-            var bucket = $scope.job.conf.warcprox_meta.stats.buckets[0];
-            if (typeof(bucket) == "object") {
-                bucket = bucket["bucket"];
-            }
-            $http.get("/api/stats/" + bucket).success(function(data) {
-                $scope.job.stats = data;
-                // console.log("job stats=", $scope.job.stats);
-            });
+            try {
+                var bucket = $scope.job.conf.warcprox_meta.stats.buckets[0];
+                if (typeof(bucket) == "object") {
+                    bucket = bucket["bucket"];
+                }
+                $http.get("/api/stats/" + bucket).success(function(data) {
+                    $scope.job.stats = data;
+                });
+            } catch (e) {
+                // no stats bucket for this job
+            }

             $http.get("/api/jobs/" + $routeParams.id + "/sites").success(function(data) {
                 $scope.sites = data.sites;
-                // console.log("sites=", $scope.sites);
                 for (var i = 0; i < $scope.sites.length; i++) {
                     loadSiteStats($http, $scope.sites[i], $scope.job);
                 }
@@ -180,7 +177,6 @@ brozzlerControllers.controller("SiteController", ["$scope", "$routeParams", "$http",
         $scope.loading = false;
         $scope.pages = [];
         $window.addEventListener("scroll", function() {
-            // console.log("window.scrollTop=" + window.scrollTop + " window.offsetHeight=" + window.offsetHeight + " window.scrollHeight=" + window.scrollHeight);
             if ($window.innerHeight + $window.scrollY + 50 >= window.document.documentElement.scrollHeight) {
                 loadMorePages();
             }
@@ -191,10 +187,8 @@ brozzlerControllers.controller("SiteController", ["$scope", "$routeParams", "$http",
                 return;
             $scope.loading = true;

-            // console.log("load more! start=" + start);
             $http.get("/api/site/" + $routeParams.id + "/pages?start=" + start + "&end=" + (start+90)).then(function(response) {
                 $scope.pages = $scope.pages.concat(response.data.pages);
-                // console.log("pages = ", $scope.pages);
                 start += response.data.pages.length;
                 $scope.loading = false;
             }, function(reason) {
@@ -209,7 +203,6 @@ brozzlerControllers.controller("SiteController", ["$scope", "$routeParams", "$http",
         $http.get("/api/site/" + $routeParams.id).success(function(data) {
             $scope.site = data;
             loadSiteStats($http, $scope.site);
-            // console.log("site = ", $scope.site);
         });
         $http.get("/api/site/" + $routeParams.id + "/yaml").success(function(data) {
             $scope.site_yaml = data;
brozzler/easy.py
@@ -48,21 +48,14 @@ import socketserver

 def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
     arg_parser = argparse.ArgumentParser(
-            prog=prog, formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-            description=(
+            formatter_class=brozzler.cli.BetterArgumentDefaultsHelpFormatter,
+            prog=prog, description=(
                 'brozzler-easy - easy deployment of brozzler, with '
                 'brozzler-worker, warcprox, pywb, and brozzler-dashboard all '
                 'running in a single process'))

     # common args
-    arg_parser.add_argument(
-            '--rethinkdb-servers', dest='rethinkdb_servers',
-            default='localhost', help=(
-                'rethinkdb servers, e.g. '
-                'db0.foo.org,db0.foo.org:38015,db1.foo.org'))
-    arg_parser.add_argument(
-            '--rethinkdb-db', dest='rethinkdb_db', default='brozzler',
-            help='rethinkdb database name')
+    brozzler.cli.add_rethinkdb_options(arg_parser)
     arg_parser.add_argument(
             '-d', '--warcs-dir', dest='warcs_dir', default='./warcs',
             help='where to write warcs')
@@ -114,18 +107,7 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
             type=int, default=8881, help='brozzler dashboard port')

     # common at the bottom args
-    arg_parser.add_argument(
-            '-v', '--verbose', dest='verbose', action='store_true',
-            help='verbose logging')
-    arg_parser.add_argument(
-            '-q', '--quiet', dest='quiet', action='store_true',
-            help='quiet logging (warnings and errors only)')
-    # arg_parser.add_argument(
-    #         '-s', '--silent', dest='log_level', action='store_const',
-    #         default=logging.INFO, const=logging.CRITICAL)
-    arg_parser.add_argument(
-            '--version', action='version',
-            version='brozzler %s - %s' % (brozzler.__version__, prog))
+    brozzler.cli.add_common_options(arg_parser)

     return arg_parser

@@ -284,17 +266,7 @@ class BrozzlerEasyController:
 def main():
     arg_parser = _build_arg_parser()
     args = arg_parser.parse_args(args=sys.argv[1:])
-    if args.verbose:
-        loglevel = logging.DEBUG
-    elif args.quiet:
-        loglevel = logging.WARNING
-    else:
-        loglevel = logging.INFO
-
-    logging.basicConfig(
-            level=loglevel, stream=sys.stderr, format=(
-                '%(asctime)s %(process)d %(levelname)s %(threadName)s '
-                '%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s'))
+    brozzler.cli.configure_logging(args)

     controller = BrozzlerEasyController(args)
     signal.signal(signal.SIGTERM, lambda a,b: controller.stop.set())
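The net effect of the brozzler-easy changes is that other tools can reuse the shared option helpers instead of redefining them. A hedged sketch of that reuse; the tool name below is made up, while the brozzler.cli functions are the ones introduced in this diff.

    import argparse
    import sys
    import brozzler.cli

    # hypothetical mini-tool that wants the same --rethinkdb-* and logging
    # options as the other brozzler commands
    arg_parser = argparse.ArgumentParser(prog='brozzler-example-tool')
    brozzler.cli.add_rethinkdb_options(arg_parser)
    brozzler.cli.add_common_options(arg_parser)

    args = arg_parser.parse_args(sys.argv[1:])
    brozzler.cli.configure_logging(args)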
setup.py (4 changed lines)
@@ -2,7 +2,7 @@
 '''
 setup.py - brozzler setup script

-Copyright (C) 2014-2016 Internet Archive
+Copyright (C) 2014-2017 Internet Archive

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -32,7 +32,7 @@ def find_package_data(package):

 setuptools.setup(
         name='brozzler',
-        version='1.1b9.dev169',
+        version='1.1b9.dev174',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',
@@ -29,7 +29,7 @@ import json

 args = argparse.Namespace()
 args.log_level = logging.INFO
-brozzler.cli._configure_logging(args)
+brozzler.cli.configure_logging(args)

 WARCPROX_META_420 = {
     'stats': {