improve brozzler-dashboard logging; fix default wayback baseurl in brozzler dashboard (https://github.com/internetarchive/brozzler/issues/31); tweak arg parsing related stuff

This commit is contained in:
Noah Levitt 2017-01-20 23:41:59 -08:00
parent 095456aa27
commit c3b637d244
4 changed files with 153 additions and 125 deletions

View File

@ -2,7 +2,7 @@
'''
brozzler/cli.py - brozzler command line executables
Copyright (C) 2014-2016 Internet Archive
Copyright (C) 2014-2017 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@ -38,16 +38,19 @@ import yaml
import shutil
import base64
def _add_common_options(arg_parser):
def add_common_options(arg_parser):
arg_parser.add_argument(
'-q', '--quiet', dest='log_level',
action='store_const', default=logging.INFO, const=logging.WARN)
'-q', '--quiet', dest='log_level', action='store_const',
default=logging.INFO, const=logging.WARN, help=(
'quiet logging, only warnings and errors'))
arg_parser.add_argument(
'-v', '--verbose', dest='log_level',
action='store_const', default=logging.INFO, const=logging.DEBUG)
'-v', '--verbose', dest='log_level', action='store_const',
default=logging.INFO, const=logging.DEBUG, help=(
'verbose logging'))
arg_parser.add_argument(
'--trace', dest='log_level',
action='store_const', default=logging.INFO, const=brozzler.TRACE)
'--trace', dest='log_level', action='store_const',
default=logging.INFO, const=brozzler.TRACE, help=(
'very verbose logging'))
# arg_parser.add_argument(
# '-s', '--silent', dest='log_level', action='store_const',
# default=logging.INFO, const=logging.CRITICAL)
@ -56,20 +59,23 @@ def _add_common_options(arg_parser):
version='brozzler %s - %s' % (
brozzler.__version__, os.path.basename(sys.argv[0])))
def _add_rethinkdb_options(arg_parser):
def add_rethinkdb_options(arg_parser):
arg_parser.add_argument(
'--rethinkdb-servers', dest='rethinkdb_servers', help=(
'--rethinkdb-servers', dest='rethinkdb_servers',
default=os.environ.get('BROZZLER_RETHINKDB_SERVERS', 'localhost'),
help=(
'rethinkdb servers, e.g. '
'db0.foo.org,db0.foo.org:38015,db1.foo.org (takes precedence '
'over environment variable BROZZLER_RETHINKDB_SERVERS)'))
'db0.foo.org,db0.foo.org:38015,db1.foo.org (default is the '
'value of environment variable BROZZLER_RETHINKDB_SERVERS)'))
arg_parser.add_argument(
'--rethinkdb-db', dest='rethinkdb_db', help=(
'rethinkdb database name (takes precedence over '
'environment variable BROZZLER_RETHINKDB_DB)'))
'--rethinkdb-db', dest='rethinkdb_db',
default=os.environ.get('BROZZLER_RETHINKDB_DB', 'brozzler'),
help=(
'rethinkdb database name (default is the value of environment '
'variable BROZZLER_RETHINKDB_DB)'))
def rethinker(args):
servers = args.rethinkdb_servers or os.environ.get(
'BROZZLER_RETHINKDB_SERVERS') or 'localhost'
servers = args.rethinkdb_servers or 'localhost'
db = args.rethinkdb_db or os.environ.get(
'BROZZLER_RETHINKDB_DB') or 'brozzler'
return rethinkstuff.Rethinker(servers.split(','), db)
@ -83,7 +89,7 @@ def _add_proxy_options(arg_parser):
'enable special features that assume the configured proxy is '
'warcprox'))
def _configure_logging(args):
def configure_logging(args):
logging.basicConfig(
stream=sys.stderr, level=args.log_level,
format=(
@ -115,6 +121,18 @@ def suggest_default_chrome_exe():
return exe
return 'chromium-browser'
class BetterArgumentDefaultsHelpFormatter(
        argparse.ArgumentDefaultsHelpFormatter):
    '''
    Help formatter that behaves like argparse.ArgumentDefaultsHelpFormatter,
    except that the help text of action='store_const' arguments is returned
    unchanged instead of being passed to the parent formatter.
    '''
    def _get_help_string(self, action):
        # store_const options: hand back the help text exactly as written
        if isinstance(action, argparse._StoreConstAction):
            return action.help
        # everything else gets the parent formatter's treatment
        return super()._get_help_string(action)
def brozzle_page():
'''
Command line utility entry point for brozzling a single page. Opens url in
@ -123,7 +141,7 @@ def brozzle_page():
arg_parser = argparse.ArgumentParser(
prog=os.path.basename(sys.argv[0]),
description='brozzle-page - brozzle a single page',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
formatter_class=BetterArgumentDefaultsHelpFormatter)
arg_parser.add_argument('url', metavar='URL', help='page url')
arg_parser.add_argument(
'-e', '--chrome-exe', dest='chrome_exe',
@ -149,10 +167,10 @@ def brozzle_page():
action='store_true', help=(
'enable special features that assume the configured proxy '
'is warcprox'))
_add_common_options(arg_parser)
add_common_options(arg_parser)
args = arg_parser.parse_args(args=sys.argv[1:])
_configure_logging(args)
configure_logging(args)
behavior_parameters = {}
if args.behavior_parameters:
@ -199,11 +217,11 @@ def brozzler_new_job():
arg_parser.add_argument(
'job_conf_file', metavar='JOB_CONF_FILE',
help='brozzler job configuration file in yaml')
_add_rethinkdb_options(arg_parser)
_add_common_options(arg_parser)
add_rethinkdb_options(arg_parser)
add_common_options(arg_parser)
args = arg_parser.parse_args(args=sys.argv[1:])
_configure_logging(args)
configure_logging(args)
r = rethinker(args)
frontier = brozzler.RethinkDbFrontier(r)
@ -225,7 +243,7 @@ def brozzler_new_site():
description='brozzler-new-site - register site to brozzle',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
arg_parser.add_argument('seed', metavar='SEED', help='seed url')
_add_rethinkdb_options(arg_parser)
add_rethinkdb_options(arg_parser)
_add_proxy_options(arg_parser)
arg_parser.add_argument(
'--time-limit', dest='time_limit', default=None,
@ -251,10 +269,10 @@ def brozzler_new_site():
arg_parser.add_argument(
'--password', dest='password', default=None,
help='use this password to try to log in if a login form is found')
_add_common_options(arg_parser)
add_common_options(arg_parser)
args = arg_parser.parse_args(args=sys.argv[1:])
_configure_logging(args)
configure_logging(args)
site = brozzler.Site(
seed=args.seed, proxy=args.proxy,
@ -279,7 +297,7 @@ def brozzler_worker():
arg_parser = argparse.ArgumentParser(
prog=os.path.basename(__file__),
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
_add_rethinkdb_options(arg_parser)
add_rethinkdb_options(arg_parser)
arg_parser.add_argument(
'-e', '--chrome-exe', dest='chrome_exe',
default=suggest_default_chrome_exe(),
@ -287,10 +305,10 @@ def brozzler_worker():
arg_parser.add_argument(
'-n', '--max-browsers', dest='max_browsers', default='1',
help='max number of chrome instances simultaneously browsing pages')
_add_common_options(arg_parser)
add_common_options(arg_parser)
args = arg_parser.parse_args(args=sys.argv[1:])
_configure_logging(args)
configure_logging(args)
def sigterm(signum, frame):
raise brozzler.ShutdownRequested('shutdown requested (caught SIGTERM)')
@ -344,11 +362,11 @@ def brozzler_ensure_tables():
arg_parser = argparse.ArgumentParser(
prog=os.path.basename(sys.argv[0]),
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
_add_rethinkdb_options(arg_parser)
_add_common_options(arg_parser)
add_rethinkdb_options(arg_parser)
add_common_options(arg_parser)
args = arg_parser.parse_args(args=sys.argv[1:])
_configure_logging(args)
configure_logging(args)
r = rethinker(args)
@ -374,11 +392,11 @@ def brozzler_list_jobs():
arg_parser.add_argument(
'-a', '--all', dest='all', action='store_true', help=(
'list all jobs (by default, only active jobs are listed)'))
_add_rethinkdb_options(arg_parser)
_add_common_options(arg_parser)
add_rethinkdb_options(arg_parser)
add_common_options(arg_parser)
args = arg_parser.parse_args(args=sys.argv[1:])
_configure_logging(args)
configure_logging(args)
r = rethinker(args)
reql = r.table('jobs').order_by('id')
@ -403,11 +421,11 @@ def brozzler_list_sites():
group.add_argument(
'--job', dest='job', metavar='JOB_ID', help=(
'list only sites for the supplied job'))
_add_rethinkdb_options(arg_parser)
_add_common_options(arg_parser)
add_rethinkdb_options(arg_parser)
add_common_options(arg_parser)
args = arg_parser.parse_args(args=sys.argv[1:])
_configure_logging(args)
configure_logging(args)
r = rethinker(args)
@ -449,11 +467,11 @@ def brozzler_list_pages():
'--claimed', dest='claimed', action='store_true', help=(
'limit only pages that are currently claimed by a brozzler '
'worker'))
_add_rethinkdb_options(arg_parser)
_add_common_options(arg_parser)
add_rethinkdb_options(arg_parser)
add_common_options(arg_parser)
args = arg_parser.parse_args(args=sys.argv[1:])
_configure_logging(args)
configure_logging(args)
r = rethinker(args)
if args.job:
@ -508,14 +526,14 @@ def brozzler_list_captures():
'use prefix match for url (n.b. may not work as expected if '
'searching key has query string because canonicalization can '
'reorder query parameters)'))
_add_rethinkdb_options(arg_parser)
_add_common_options(arg_parser)
add_rethinkdb_options(arg_parser)
add_common_options(arg_parser)
arg_parser.add_argument(
'url_or_sha1', metavar='URL_or_SHA1',
help='url or sha1 to look up in captures table')
args = arg_parser.parse_args(args=sys.argv[1:])
_configure_logging(args)
configure_logging(args)
r = rethinker(args)

View File

@ -2,7 +2,7 @@
brozzler/dashboard/__init__.py - flask app for brozzler dashboard, defines api
endpoints etc
Copyright (C) 2014-2016 Internet Archive
Copyright (C) 2014-2017 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@ -35,27 +35,15 @@ import rethinkdb
import yaml
import base64
# flask does its own logging config
# logging.basicConfig(
# stream=sys.stdout, level=logging.INFO,
# format=(
# "%(asctime)s %(process)d %(levelname)s %(threadName)s "
# "%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s")
app = flask.Flask(__name__)
# http://stackoverflow.com/questions/26578733/why-is-flask-application-not-creating-any-logs-when-hosted-by-gunicorn
gunicorn_error_logger = logging.getLogger('gunicorn.error')
app.logger.handlers.extend(gunicorn_error_logger.handlers)
app.logger.setLevel(logging.INFO)
# configure with environment variables
SETTINGS = {
'RETHINKDB_SERVERS': os.environ.get(
'BROZZLER_RETHINKDB_SERVERS', 'localhost').split(','),
'RETHINKDB_DB': os.environ.get('BROZZLER_RETHINKDB_DB', 'brozzler'),
'WAYBACK_BASEURL': os.environ.get(
'WAYBACK_BASEURL', 'http://localhost:8091/brozzler'),
'WAYBACK_BASEURL', 'http://localhost:8880/brozzler'),
}
r = rethinkstuff.Rethinker(
SETTINGS['RETHINKDB_SERVERS'], db=SETTINGS['RETHINKDB_DB'])
@ -69,20 +57,24 @@ def service_registry():
@app.route("/api/sites/<site_id>/queued_count")
@app.route("/api/site/<site_id>/queued_count")
def queued_count(site_id):
count = r.table("pages").between(
reql = r.table("pages").between(
[site_id, 0, False, r.minval], [site_id, 0, False, r.maxval],
index="priority_by_site").count().run()
index="priority_by_site").count()
logging.debug("querying rethinkdb: %s", reql)
count = reql.run()
return flask.jsonify(count=count)
@app.route("/api/sites/<site_id>/queue")
@app.route("/api/site/<site_id>/queue")
def queue(site_id):
app.logger.info("flask.request.args=%s", flask.request.args)
logging.debug("flask.request.args=%s", flask.request.args)
start = flask.request.args.get("start", 0)
end = flask.request.args.get("end", start + 90)
queue_ = r.table("pages").between(
reql = r.table("pages").between(
[site_id, 0, False, r.minval], [site_id, 0, False, r.maxval],
index="priority_by_site")[start:end].run()
index="priority_by_site")[start:end]
logging.debug("querying rethinkdb: %s", reql)
queue_ = reql.run()
return flask.jsonify(queue_=list(queue_))
@app.route("/api/sites/<site_id>/pages_count")
@ -90,42 +82,51 @@ def queue(site_id):
@app.route("/api/sites/<site_id>/page_count")
@app.route("/api/site/<site_id>/page_count")
def page_count(site_id):
count = r.table("pages").between(
reql = r.table("pages").between(
[site_id, 1, False, r.minval],
[site_id, r.maxval, False, r.maxval],
index="priority_by_site").count().run()
index="priority_by_site").count()
logging.debug("querying rethinkdb: %s", reql)
count = reql.run()
return flask.jsonify(count=count)
@app.route("/api/sites/<site_id>/pages")
@app.route("/api/site/<site_id>/pages")
def pages(site_id):
"""Pages already crawled."""
app.logger.info("flask.request.args=%s", flask.request.args)
start = int(flask.request.args.get("start", 0))
end = int(flask.request.args.get("end", start + 90))
pages_ = r.table("pages").between(
reql = r.table("pages").between(
[site_id, 1, r.minval], [site_id, r.maxval, r.maxval],
index="least_hops").order_by(index="least_hops")[start:end].run()
index="least_hops").order_by(index="least_hops")[start:end]
logging.debug("querying rethinkdb: %s", reql)
pages_ = reql.run()
return flask.jsonify(pages=list(pages_))
@app.route("/api/pages/<page_id>")
@app.route("/api/page/<page_id>")
def page(page_id):
page_ = r.table("pages").get(page_id).run()
reql = r.table("pages").get(page_id)
logging.debug("querying rethinkdb: %s", reql)
page_ = reql.run()
return flask.jsonify(page_)
@app.route("/api/pages/<page_id>/yaml")
@app.route("/api/page/<page_id>/yaml")
def page_yaml(page_id):
page_ = r.table("pages").get(page_id).run()
reql = r.table("pages").get(page_id)
logging.debug("querying rethinkdb: %s", reql)
page_ = reql.run()
return app.response_class(
yaml.dump(page_, default_flow_style=False),
mimetype='application/yaml')
mimetype="application/yaml")
@app.route("/api/sites/<site_id>")
@app.route("/api/site/<site_id>")
def site(site_id):
s = r.table("sites").get(site_id).run()
reql = r.table("sites").get(site_id)
logging.debug("querying rethinkdb: %s", reql)
s = reql.run()
if "cookie_db" in s:
s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii")
return flask.jsonify(s)
@ -133,20 +134,30 @@ def site(site_id):
@app.route("/api/sites/<site_id>/yaml")
@app.route("/api/site/<site_id>/yaml")
def site_yaml(site_id):
site_ = r.table("sites").get(site_id).run()
reql = r.table("sites").get(site_id)
logging.debug("querying rethinkdb: %s", reql)
site_ = reql.run()
return app.response_class(
yaml.dump(site_, default_flow_style=False),
mimetype='application/yaml')
mimetype="application/yaml")
@app.route("/api/stats/<bucket>")
def stats(bucket):
stats_ = r.table("stats").get(bucket).run()
reql = r.table("stats").get(bucket)
logging.debug("querying rethinkdb: %s", reql)
stats_ = reql.run()
return flask.jsonify(stats_)
@app.route("/api/jobs/<int:job_id>/sites")
@app.route("/api/job/<int:job_id>/sites")
@app.route("/api/jobs/<job_id>/sites")
@app.route("/api/job/<job_id>/sites")
def sites(job_id):
sites_ = list(r.table("sites").get_all(job_id, index="job_id").run())
try:
jid = int(job_id)
except ValueError:
jid = job_id
reql = r.table("sites").get_all(jid, index="job_id")
logging.debug("querying rethinkdb: %s", reql)
sites_ = list(reql.run())
# TypeError: <binary, 7168 bytes, '53 51 4c 69 74 65...'> is not JSON serializable
for s in sites_:
if "cookie_db" in s:
@ -156,26 +167,40 @@ def sites(job_id):
@app.route("/api/jobless-sites")
def jobless_sites():
# XXX inefficient (unindexed) query
sites_ = list(r.table("sites").filter(~r.row.has_fields("job_id")).run())
reql = r.table("sites").filter(~r.row.has_fields("job_id"))
logging.debug("querying rethinkdb: %s", reql)
sites_ = list(reql.run())
# TypeError: <binary, 7168 bytes, '53 51 4c 69 74 65...'> is not JSON serializable
for s in sites_:
if "cookie_db" in s:
s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii")
return flask.jsonify(sites=sites_)
@app.route("/api/jobs/<int:job_id>")
@app.route("/api/job/<int:job_id>")
@app.route("/api/jobs/<job_id>")
@app.route("/api/job/<job_id>")
def job(job_id):
job_ = r.table("jobs").get(job_id).run()
try:
jid = int(job_id)
except ValueError:
jid = job_id
reql = r.table("jobs").get(jid)
logging.debug("querying rethinkdb: %s", reql)
job_ = reql.run()
return flask.jsonify(job_)
@app.route("/api/jobs/<int:job_id>/yaml")
@app.route("/api/job/<int:job_id>/yaml")
@app.route("/api/jobs/<job_id>/yaml")
@app.route("/api/job/<job_id>/yaml")
def job_yaml(job_id):
job_ = r.table("jobs").get(job_id).run()
try:
jid = int(job_id)
except ValueError:
jid = job_id
reql = r.table("jobs").get(jid)
logging.debug("querying rethinkdb: %s", reql)
job_ = reql.run()
return app.response_class(
yaml.dump(job_, default_flow_style=False),
mimetype='application/yaml')
mimetype="application/yaml")
@app.route("/api/workers")
def workers():
@ -189,7 +214,9 @@ def services():
@app.route("/api/jobs")
def jobs():
jobs_ = list(r.table("jobs").order_by(rethinkdb.desc("id")).run())
reql = r.table("jobs").order_by(rethinkdb.desc("id"))
logging.debug("querying rethinkdb: %s", reql)
jobs_ = list(reql.run())
return flask.jsonify(jobs=jobs_)
@app.route("/api/config")
@ -209,6 +236,12 @@ def root(path):
try:
import gunicorn.app.base
from gunicorn.six import iteritems
import gunicorn.glogging
class BypassGunicornLogging(gunicorn.glogging.Logger):
    '''
    gunicorn.glogging.Logger subclass whose setup() points both the error
    log and the access log at the root logger's handlers. The base class
    setup() is not called.
    '''
    def setup(self, cfg):
        # cfg is not consulted here
        root_handlers = logging.root.handlers
        self.error_log.handlers = root_handlers
        self.access_log.handlers = root_handlers
class GunicornBrozzlerDashboard(gunicorn.app.base.BaseApplication):
def __init__(self, app, options=None):
@ -222,21 +255,24 @@ try:
if key in self.cfg.settings and value is not None])
for key, value in iteritems(config):
self.cfg.set(key.lower(), value)
self.cfg.set("logger_class", BypassGunicornLogging)
self.cfg.set("accesslog", "dummy-value")
def load(self):
return self.application
def run(**options):
logging.info('running brozzler-dashboard using gunicorn')
logging.info("running brozzler-dashboard using gunicorn")
GunicornBrozzlerDashboard(app, options).run()
except ImportError:
def run():
logging.info('running brozzler-dashboard using simple flask app.run')
logging.info("running brozzler-dashboard using simple flask app.run")
app.run()
def main():
import argparse
import brozzler.cli
arg_parser = argparse.ArgumentParser(
prog=os.path.basename(sys.argv[0]),
formatter_class=argparse.RawDescriptionHelpFormatter,
@ -252,8 +288,10 @@ def main():
' BROZZLER_RETHINKDB_DB rethinkdb database name '
'(default: brozzler)\n'
' WAYBACK_BASEURL base url for constructing wayback '
'links (default http://localhost:8091/brozzler)'))
'links (default http://localhost:8880/brozzler)'))
brozzler.cli.add_common_options(arg_parser)
args = arg_parser.parse_args(args=sys.argv[1:])
brozzler.cli.configure_logging(args)
run()
if __name__ == "__main__":

View File

@ -48,21 +48,14 @@ import socketserver
def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
arg_parser = argparse.ArgumentParser(
prog=prog, formatter_class=argparse.ArgumentDefaultsHelpFormatter,
description=(
formatter_class=brozzler.cli.BetterArgumentDefaultsHelpFormatter,
prog=prog, description=(
'brozzler-easy - easy deployment of brozzler, with '
'brozzler-worker, warcprox, pywb, and brozzler-dashboard all '
'running in a single process'))
# common args
arg_parser.add_argument(
'--rethinkdb-servers', dest='rethinkdb_servers',
default='localhost', help=(
'rethinkdb servers, e.g. '
'db0.foo.org,db0.foo.org:38015,db1.foo.org'))
arg_parser.add_argument(
'--rethinkdb-db', dest='rethinkdb_db', default='brozzler',
help='rethinkdb database name')
brozzler.cli.add_rethinkdb_options(arg_parser)
arg_parser.add_argument(
'-d', '--warcs-dir', dest='warcs_dir', default='./warcs',
help='where to write warcs')
@ -114,18 +107,7 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
type=int, default=8881, help='brozzler dashboard port')
# common at the bottom args
arg_parser.add_argument(
'-v', '--verbose', dest='verbose', action='store_true',
help='verbose logging')
arg_parser.add_argument(
'-q', '--quiet', dest='quiet', action='store_true',
help='quiet logging (warnings and errors only)')
# arg_parser.add_argument(
# '-s', '--silent', dest='log_level', action='store_const',
# default=logging.INFO, const=logging.CRITICAL)
arg_parser.add_argument(
'--version', action='version',
version='brozzler %s - %s' % (brozzler.__version__, prog))
brozzler.cli.add_common_options(arg_parser)
return arg_parser
@ -284,17 +266,7 @@ class BrozzlerEasyController:
def main():
arg_parser = _build_arg_parser()
args = arg_parser.parse_args(args=sys.argv[1:])
if args.verbose:
loglevel = logging.DEBUG
elif args.quiet:
loglevel = logging.WARNING
else:
loglevel = logging.INFO
logging.basicConfig(
level=loglevel, stream=sys.stderr, format=(
'%(asctime)s %(process)d %(levelname)s %(threadName)s '
'%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s'))
brozzler.cli.configure_logging(args)
controller = BrozzlerEasyController(args)
signal.signal(signal.SIGTERM, lambda a,b: controller.stop.set())

View File

@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup(
name='brozzler',
version='1.1b9.dev171',
version='1.1b9.dev172',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',