mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 08:39:59 -05:00
improve brozzler-dashboard logging; fix default wayback baseurl in brozzler dashboard (https://github.com/internetarchive/brozzler/issues/31); tweak arg parsing related stuff
This commit is contained in:
parent
095456aa27
commit
c3b637d244
108
brozzler/cli.py
108
brozzler/cli.py
@ -2,7 +2,7 @@
|
|||||||
'''
|
'''
|
||||||
brozzler/cli.py - brozzler command line executables
|
brozzler/cli.py - brozzler command line executables
|
||||||
|
|
||||||
Copyright (C) 2014-2016 Internet Archive
|
Copyright (C) 2014-2017 Internet Archive
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
you may not use this file except in compliance with the License.
|
you may not use this file except in compliance with the License.
|
||||||
@ -38,16 +38,19 @@ import yaml
|
|||||||
import shutil
|
import shutil
|
||||||
import base64
|
import base64
|
||||||
|
|
||||||
def _add_common_options(arg_parser):
|
def add_common_options(arg_parser):
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'-q', '--quiet', dest='log_level',
|
'-q', '--quiet', dest='log_level', action='store_const',
|
||||||
action='store_const', default=logging.INFO, const=logging.WARN)
|
default=logging.INFO, const=logging.WARN, help=(
|
||||||
|
'quiet logging, only warnings and errors'))
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'-v', '--verbose', dest='log_level',
|
'-v', '--verbose', dest='log_level', action='store_const',
|
||||||
action='store_const', default=logging.INFO, const=logging.DEBUG)
|
default=logging.INFO, const=logging.DEBUG, help=(
|
||||||
|
'verbose logging'))
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'--trace', dest='log_level',
|
'--trace', dest='log_level', action='store_const',
|
||||||
action='store_const', default=logging.INFO, const=brozzler.TRACE)
|
default=logging.INFO, const=brozzler.TRACE, help=(
|
||||||
|
'very verbose logging'))
|
||||||
# arg_parser.add_argument(
|
# arg_parser.add_argument(
|
||||||
# '-s', '--silent', dest='log_level', action='store_const',
|
# '-s', '--silent', dest='log_level', action='store_const',
|
||||||
# default=logging.INFO, const=logging.CRITICAL)
|
# default=logging.INFO, const=logging.CRITICAL)
|
||||||
@ -56,20 +59,23 @@ def _add_common_options(arg_parser):
|
|||||||
version='brozzler %s - %s' % (
|
version='brozzler %s - %s' % (
|
||||||
brozzler.__version__, os.path.basename(sys.argv[0])))
|
brozzler.__version__, os.path.basename(sys.argv[0])))
|
||||||
|
|
||||||
def _add_rethinkdb_options(arg_parser):
|
def add_rethinkdb_options(arg_parser):
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'--rethinkdb-servers', dest='rethinkdb_servers', help=(
|
'--rethinkdb-servers', dest='rethinkdb_servers',
|
||||||
|
default=os.environ.get('BROZZLER_RETHINKDB_SERVERS', 'localhost'),
|
||||||
|
help=(
|
||||||
'rethinkdb servers, e.g. '
|
'rethinkdb servers, e.g. '
|
||||||
'db0.foo.org,db0.foo.org:38015,db1.foo.org (takes precedence '
|
'db0.foo.org,db0.foo.org:38015,db1.foo.org (default is the '
|
||||||
'over environment variable BROZZLER_RETHINKDB_SERVERS)'))
|
'value of environment variable BROZZLER_RETHINKDB_SERVERS)'))
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'--rethinkdb-db', dest='rethinkdb_db', help=(
|
'--rethinkdb-db', dest='rethinkdb_db',
|
||||||
'rethinkdb database name (takes precedence over '
|
default=os.environ.get('BROZZLER_RETHINKDB_DB', 'brozzler'),
|
||||||
'environment variable BROZZLER_RETHINKDB_DB)'))
|
help=(
|
||||||
|
'rethinkdb database name (default is the value of environment '
|
||||||
|
'variable BROZZLER_RETHINKDB_DB)'))
|
||||||
|
|
||||||
def rethinker(args):
|
def rethinker(args):
|
||||||
servers = args.rethinkdb_servers or os.environ.get(
|
servers = args.rethinkdb_servers or 'localhost'
|
||||||
'BROZZLER_RETHINKDB_SERVERS') or 'localhost'
|
|
||||||
db = args.rethinkdb_db or os.environ.get(
|
db = args.rethinkdb_db or os.environ.get(
|
||||||
'BROZZLER_RETHINKDB_DB') or 'brozzler'
|
'BROZZLER_RETHINKDB_DB') or 'brozzler'
|
||||||
return rethinkstuff.Rethinker(servers.split(','), db)
|
return rethinkstuff.Rethinker(servers.split(','), db)
|
||||||
@ -83,7 +89,7 @@ def _add_proxy_options(arg_parser):
|
|||||||
'enable special features that assume the configured proxy is '
|
'enable special features that assume the configured proxy is '
|
||||||
'warcprox'))
|
'warcprox'))
|
||||||
|
|
||||||
def _configure_logging(args):
|
def configure_logging(args):
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
stream=sys.stderr, level=args.log_level,
|
stream=sys.stderr, level=args.log_level,
|
||||||
format=(
|
format=(
|
||||||
@ -115,6 +121,18 @@ def suggest_default_chrome_exe():
|
|||||||
return exe
|
return exe
|
||||||
return 'chromium-browser'
|
return 'chromium-browser'
|
||||||
|
|
||||||
|
class BetterArgumentDefaultsHelpFormatter(
|
||||||
|
argparse.ArgumentDefaultsHelpFormatter):
|
||||||
|
'''
|
||||||
|
Like argparse.ArgumentDefaultsHelpFormatter but omits the default value
|
||||||
|
for arguments with action='store_const'.
|
||||||
|
'''
|
||||||
|
def _get_help_string(self, action):
|
||||||
|
if isinstance(action, argparse._StoreConstAction):
|
||||||
|
return action.help
|
||||||
|
else:
|
||||||
|
return super()._get_help_string(action)
|
||||||
|
|
||||||
def brozzle_page():
|
def brozzle_page():
|
||||||
'''
|
'''
|
||||||
Command line utility entry point for brozzling a single page. Opens url in
|
Command line utility entry point for brozzling a single page. Opens url in
|
||||||
@ -123,7 +141,7 @@ def brozzle_page():
|
|||||||
arg_parser = argparse.ArgumentParser(
|
arg_parser = argparse.ArgumentParser(
|
||||||
prog=os.path.basename(sys.argv[0]),
|
prog=os.path.basename(sys.argv[0]),
|
||||||
description='brozzle-page - brozzle a single page',
|
description='brozzle-page - brozzle a single page',
|
||||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
formatter_class=BetterArgumentDefaultsHelpFormatter)
|
||||||
arg_parser.add_argument('url', metavar='URL', help='page url')
|
arg_parser.add_argument('url', metavar='URL', help='page url')
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'-e', '--chrome-exe', dest='chrome_exe',
|
'-e', '--chrome-exe', dest='chrome_exe',
|
||||||
@ -149,10 +167,10 @@ def brozzle_page():
|
|||||||
action='store_true', help=(
|
action='store_true', help=(
|
||||||
'enable special features that assume the configured proxy '
|
'enable special features that assume the configured proxy '
|
||||||
'is warcprox'))
|
'is warcprox'))
|
||||||
_add_common_options(arg_parser)
|
add_common_options(arg_parser)
|
||||||
|
|
||||||
args = arg_parser.parse_args(args=sys.argv[1:])
|
args = arg_parser.parse_args(args=sys.argv[1:])
|
||||||
_configure_logging(args)
|
configure_logging(args)
|
||||||
|
|
||||||
behavior_parameters = {}
|
behavior_parameters = {}
|
||||||
if args.behavior_parameters:
|
if args.behavior_parameters:
|
||||||
@ -199,11 +217,11 @@ def brozzler_new_job():
|
|||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'job_conf_file', metavar='JOB_CONF_FILE',
|
'job_conf_file', metavar='JOB_CONF_FILE',
|
||||||
help='brozzler job configuration file in yaml')
|
help='brozzler job configuration file in yaml')
|
||||||
_add_rethinkdb_options(arg_parser)
|
add_rethinkdb_options(arg_parser)
|
||||||
_add_common_options(arg_parser)
|
add_common_options(arg_parser)
|
||||||
|
|
||||||
args = arg_parser.parse_args(args=sys.argv[1:])
|
args = arg_parser.parse_args(args=sys.argv[1:])
|
||||||
_configure_logging(args)
|
configure_logging(args)
|
||||||
|
|
||||||
r = rethinker(args)
|
r = rethinker(args)
|
||||||
frontier = brozzler.RethinkDbFrontier(r)
|
frontier = brozzler.RethinkDbFrontier(r)
|
||||||
@ -225,7 +243,7 @@ def brozzler_new_site():
|
|||||||
description='brozzler-new-site - register site to brozzle',
|
description='brozzler-new-site - register site to brozzle',
|
||||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||||
arg_parser.add_argument('seed', metavar='SEED', help='seed url')
|
arg_parser.add_argument('seed', metavar='SEED', help='seed url')
|
||||||
_add_rethinkdb_options(arg_parser)
|
add_rethinkdb_options(arg_parser)
|
||||||
_add_proxy_options(arg_parser)
|
_add_proxy_options(arg_parser)
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'--time-limit', dest='time_limit', default=None,
|
'--time-limit', dest='time_limit', default=None,
|
||||||
@ -251,10 +269,10 @@ def brozzler_new_site():
|
|||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'--password', dest='password', default=None,
|
'--password', dest='password', default=None,
|
||||||
help='use this password to try to log in if a login form is found')
|
help='use this password to try to log in if a login form is found')
|
||||||
_add_common_options(arg_parser)
|
add_common_options(arg_parser)
|
||||||
|
|
||||||
args = arg_parser.parse_args(args=sys.argv[1:])
|
args = arg_parser.parse_args(args=sys.argv[1:])
|
||||||
_configure_logging(args)
|
configure_logging(args)
|
||||||
|
|
||||||
site = brozzler.Site(
|
site = brozzler.Site(
|
||||||
seed=args.seed, proxy=args.proxy,
|
seed=args.seed, proxy=args.proxy,
|
||||||
@ -279,7 +297,7 @@ def brozzler_worker():
|
|||||||
arg_parser = argparse.ArgumentParser(
|
arg_parser = argparse.ArgumentParser(
|
||||||
prog=os.path.basename(__file__),
|
prog=os.path.basename(__file__),
|
||||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||||
_add_rethinkdb_options(arg_parser)
|
add_rethinkdb_options(arg_parser)
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'-e', '--chrome-exe', dest='chrome_exe',
|
'-e', '--chrome-exe', dest='chrome_exe',
|
||||||
default=suggest_default_chrome_exe(),
|
default=suggest_default_chrome_exe(),
|
||||||
@ -287,10 +305,10 @@ def brozzler_worker():
|
|||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'-n', '--max-browsers', dest='max_browsers', default='1',
|
'-n', '--max-browsers', dest='max_browsers', default='1',
|
||||||
help='max number of chrome instances simultaneously browsing pages')
|
help='max number of chrome instances simultaneously browsing pages')
|
||||||
_add_common_options(arg_parser)
|
add_common_options(arg_parser)
|
||||||
|
|
||||||
args = arg_parser.parse_args(args=sys.argv[1:])
|
args = arg_parser.parse_args(args=sys.argv[1:])
|
||||||
_configure_logging(args)
|
configure_logging(args)
|
||||||
|
|
||||||
def sigterm(signum, frame):
|
def sigterm(signum, frame):
|
||||||
raise brozzler.ShutdownRequested('shutdown requested (caught SIGTERM)')
|
raise brozzler.ShutdownRequested('shutdown requested (caught SIGTERM)')
|
||||||
@ -344,11 +362,11 @@ def brozzler_ensure_tables():
|
|||||||
arg_parser = argparse.ArgumentParser(
|
arg_parser = argparse.ArgumentParser(
|
||||||
prog=os.path.basename(sys.argv[0]),
|
prog=os.path.basename(sys.argv[0]),
|
||||||
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
||||||
_add_rethinkdb_options(arg_parser)
|
add_rethinkdb_options(arg_parser)
|
||||||
_add_common_options(arg_parser)
|
add_common_options(arg_parser)
|
||||||
|
|
||||||
args = arg_parser.parse_args(args=sys.argv[1:])
|
args = arg_parser.parse_args(args=sys.argv[1:])
|
||||||
_configure_logging(args)
|
configure_logging(args)
|
||||||
|
|
||||||
r = rethinker(args)
|
r = rethinker(args)
|
||||||
|
|
||||||
@ -374,11 +392,11 @@ def brozzler_list_jobs():
|
|||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'-a', '--all', dest='all', action='store_true', help=(
|
'-a', '--all', dest='all', action='store_true', help=(
|
||||||
'list all jobs (by default, only active jobs are listed)'))
|
'list all jobs (by default, only active jobs are listed)'))
|
||||||
_add_rethinkdb_options(arg_parser)
|
add_rethinkdb_options(arg_parser)
|
||||||
_add_common_options(arg_parser)
|
add_common_options(arg_parser)
|
||||||
|
|
||||||
args = arg_parser.parse_args(args=sys.argv[1:])
|
args = arg_parser.parse_args(args=sys.argv[1:])
|
||||||
_configure_logging(args)
|
configure_logging(args)
|
||||||
|
|
||||||
r = rethinker(args)
|
r = rethinker(args)
|
||||||
reql = r.table('jobs').order_by('id')
|
reql = r.table('jobs').order_by('id')
|
||||||
@ -403,11 +421,11 @@ def brozzler_list_sites():
|
|||||||
group.add_argument(
|
group.add_argument(
|
||||||
'--job', dest='job', metavar='JOB_ID', help=(
|
'--job', dest='job', metavar='JOB_ID', help=(
|
||||||
'list only sites for the supplied job'))
|
'list only sites for the supplied job'))
|
||||||
_add_rethinkdb_options(arg_parser)
|
add_rethinkdb_options(arg_parser)
|
||||||
_add_common_options(arg_parser)
|
add_common_options(arg_parser)
|
||||||
|
|
||||||
args = arg_parser.parse_args(args=sys.argv[1:])
|
args = arg_parser.parse_args(args=sys.argv[1:])
|
||||||
_configure_logging(args)
|
configure_logging(args)
|
||||||
|
|
||||||
r = rethinker(args)
|
r = rethinker(args)
|
||||||
|
|
||||||
@ -449,11 +467,11 @@ def brozzler_list_pages():
|
|||||||
'--claimed', dest='claimed', action='store_true', help=(
|
'--claimed', dest='claimed', action='store_true', help=(
|
||||||
'limit only pages that are currently claimed by a brozzler '
|
'limit only pages that are currently claimed by a brozzler '
|
||||||
'worker'))
|
'worker'))
|
||||||
_add_rethinkdb_options(arg_parser)
|
add_rethinkdb_options(arg_parser)
|
||||||
_add_common_options(arg_parser)
|
add_common_options(arg_parser)
|
||||||
|
|
||||||
args = arg_parser.parse_args(args=sys.argv[1:])
|
args = arg_parser.parse_args(args=sys.argv[1:])
|
||||||
_configure_logging(args)
|
configure_logging(args)
|
||||||
|
|
||||||
r = rethinker(args)
|
r = rethinker(args)
|
||||||
if args.job:
|
if args.job:
|
||||||
@ -508,14 +526,14 @@ def brozzler_list_captures():
|
|||||||
'use prefix match for url (n.b. may not work as expected if '
|
'use prefix match for url (n.b. may not work as expected if '
|
||||||
'searching key has query string because canonicalization can '
|
'searching key has query string because canonicalization can '
|
||||||
'reorder query parameters)'))
|
'reorder query parameters)'))
|
||||||
_add_rethinkdb_options(arg_parser)
|
add_rethinkdb_options(arg_parser)
|
||||||
_add_common_options(arg_parser)
|
add_common_options(arg_parser)
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'url_or_sha1', metavar='URL_or_SHA1',
|
'url_or_sha1', metavar='URL_or_SHA1',
|
||||||
help='url or sha1 to look up in captures table')
|
help='url or sha1 to look up in captures table')
|
||||||
|
|
||||||
args = arg_parser.parse_args(args=sys.argv[1:])
|
args = arg_parser.parse_args(args=sys.argv[1:])
|
||||||
_configure_logging(args)
|
configure_logging(args)
|
||||||
|
|
||||||
r = rethinker(args)
|
r = rethinker(args)
|
||||||
|
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
brozzler/dashboard/__init__.py - flask app for brozzler dashboard, defines api
|
brozzler/dashboard/__init__.py - flask app for brozzler dashboard, defines api
|
||||||
endpoints etc
|
endpoints etc
|
||||||
|
|
||||||
Copyright (C) 2014-2016 Internet Archive
|
Copyright (C) 2014-2017 Internet Archive
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
you may not use this file except in compliance with the License.
|
you may not use this file except in compliance with the License.
|
||||||
@ -35,27 +35,15 @@ import rethinkdb
|
|||||||
import yaml
|
import yaml
|
||||||
import base64
|
import base64
|
||||||
|
|
||||||
# flask does its own logging config
|
|
||||||
# logging.basicConfig(
|
|
||||||
# stream=sys.stdout, level=logging.INFO,
|
|
||||||
# format=(
|
|
||||||
# "%(asctime)s %(process)d %(levelname)s %(threadName)s "
|
|
||||||
# "%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s")
|
|
||||||
|
|
||||||
app = flask.Flask(__name__)
|
app = flask.Flask(__name__)
|
||||||
|
|
||||||
# http://stackoverflow.com/questions/26578733/why-is-flask-application-not-creating-any-logs-when-hosted-by-gunicorn
|
|
||||||
gunicorn_error_logger = logging.getLogger('gunicorn.error')
|
|
||||||
app.logger.handlers.extend(gunicorn_error_logger.handlers)
|
|
||||||
app.logger.setLevel(logging.INFO)
|
|
||||||
|
|
||||||
# configure with environment variables
|
# configure with environment variables
|
||||||
SETTINGS = {
|
SETTINGS = {
|
||||||
'RETHINKDB_SERVERS': os.environ.get(
|
'RETHINKDB_SERVERS': os.environ.get(
|
||||||
'BROZZLER_RETHINKDB_SERVERS', 'localhost').split(','),
|
'BROZZLER_RETHINKDB_SERVERS', 'localhost').split(','),
|
||||||
'RETHINKDB_DB': os.environ.get('BROZZLER_RETHINKDB_DB', 'brozzler'),
|
'RETHINKDB_DB': os.environ.get('BROZZLER_RETHINKDB_DB', 'brozzler'),
|
||||||
'WAYBACK_BASEURL': os.environ.get(
|
'WAYBACK_BASEURL': os.environ.get(
|
||||||
'WAYBACK_BASEURL', 'http://localhost:8091/brozzler'),
|
'WAYBACK_BASEURL', 'http://localhost:8880/brozzler'),
|
||||||
}
|
}
|
||||||
r = rethinkstuff.Rethinker(
|
r = rethinkstuff.Rethinker(
|
||||||
SETTINGS['RETHINKDB_SERVERS'], db=SETTINGS['RETHINKDB_DB'])
|
SETTINGS['RETHINKDB_SERVERS'], db=SETTINGS['RETHINKDB_DB'])
|
||||||
@ -69,20 +57,24 @@ def service_registry():
|
|||||||
@app.route("/api/sites/<site_id>/queued_count")
|
@app.route("/api/sites/<site_id>/queued_count")
|
||||||
@app.route("/api/site/<site_id>/queued_count")
|
@app.route("/api/site/<site_id>/queued_count")
|
||||||
def queued_count(site_id):
|
def queued_count(site_id):
|
||||||
count = r.table("pages").between(
|
reql = r.table("pages").between(
|
||||||
[site_id, 0, False, r.minval], [site_id, 0, False, r.maxval],
|
[site_id, 0, False, r.minval], [site_id, 0, False, r.maxval],
|
||||||
index="priority_by_site").count().run()
|
index="priority_by_site").count()
|
||||||
|
logging.debug("querying rethinkdb: %s", reql)
|
||||||
|
count = reql.run()
|
||||||
return flask.jsonify(count=count)
|
return flask.jsonify(count=count)
|
||||||
|
|
||||||
@app.route("/api/sites/<site_id>/queue")
|
@app.route("/api/sites/<site_id>/queue")
|
||||||
@app.route("/api/site/<site_id>/queue")
|
@app.route("/api/site/<site_id>/queue")
|
||||||
def queue(site_id):
|
def queue(site_id):
|
||||||
app.logger.info("flask.request.args=%s", flask.request.args)
|
logging.debug("flask.request.args=%s", flask.request.args)
|
||||||
start = flask.request.args.get("start", 0)
|
start = flask.request.args.get("start", 0)
|
||||||
end = flask.request.args.get("end", start + 90)
|
end = flask.request.args.get("end", start + 90)
|
||||||
queue_ = r.table("pages").between(
|
reql = r.table("pages").between(
|
||||||
[site_id, 0, False, r.minval], [site_id, 0, False, r.maxval],
|
[site_id, 0, False, r.minval], [site_id, 0, False, r.maxval],
|
||||||
index="priority_by_site")[start:end].run()
|
index="priority_by_site")[start:end]
|
||||||
|
logging.debug("querying rethinkdb: %s", reql)
|
||||||
|
queue_ = reql.run()
|
||||||
return flask.jsonify(queue_=list(queue_))
|
return flask.jsonify(queue_=list(queue_))
|
||||||
|
|
||||||
@app.route("/api/sites/<site_id>/pages_count")
|
@app.route("/api/sites/<site_id>/pages_count")
|
||||||
@ -90,42 +82,51 @@ def queue(site_id):
|
|||||||
@app.route("/api/sites/<site_id>/page_count")
|
@app.route("/api/sites/<site_id>/page_count")
|
||||||
@app.route("/api/site/<site_id>/page_count")
|
@app.route("/api/site/<site_id>/page_count")
|
||||||
def page_count(site_id):
|
def page_count(site_id):
|
||||||
count = r.table("pages").between(
|
reql = r.table("pages").between(
|
||||||
[site_id, 1, False, r.minval],
|
[site_id, 1, False, r.minval],
|
||||||
[site_id, r.maxval, False, r.maxval],
|
[site_id, r.maxval, False, r.maxval],
|
||||||
index="priority_by_site").count().run()
|
index="priority_by_site").count()
|
||||||
|
logging.debug("querying rethinkdb: %s", reql)
|
||||||
|
count = reql.run()
|
||||||
return flask.jsonify(count=count)
|
return flask.jsonify(count=count)
|
||||||
|
|
||||||
@app.route("/api/sites/<site_id>/pages")
|
@app.route("/api/sites/<site_id>/pages")
|
||||||
@app.route("/api/site/<site_id>/pages")
|
@app.route("/api/site/<site_id>/pages")
|
||||||
def pages(site_id):
|
def pages(site_id):
|
||||||
"""Pages already crawled."""
|
"""Pages already crawled."""
|
||||||
app.logger.info("flask.request.args=%s", flask.request.args)
|
|
||||||
start = int(flask.request.args.get("start", 0))
|
start = int(flask.request.args.get("start", 0))
|
||||||
end = int(flask.request.args.get("end", start + 90))
|
end = int(flask.request.args.get("end", start + 90))
|
||||||
pages_ = r.table("pages").between(
|
reql = r.table("pages").between(
|
||||||
[site_id, 1, r.minval], [site_id, r.maxval, r.maxval],
|
[site_id, 1, r.minval], [site_id, r.maxval, r.maxval],
|
||||||
index="least_hops").order_by(index="least_hops")[start:end].run()
|
index="least_hops").order_by(index="least_hops")[start:end]
|
||||||
|
logging.debug("querying rethinkdb: %s", reql)
|
||||||
|
pages_ = reql.run()
|
||||||
return flask.jsonify(pages=list(pages_))
|
return flask.jsonify(pages=list(pages_))
|
||||||
|
|
||||||
@app.route("/api/pages/<page_id>")
|
@app.route("/api/pages/<page_id>")
|
||||||
@app.route("/api/page/<page_id>")
|
@app.route("/api/page/<page_id>")
|
||||||
def page(page_id):
|
def page(page_id):
|
||||||
page_ = r.table("pages").get(page_id).run()
|
reql = r.table("pages").get(page_id)
|
||||||
|
logging.debug("querying rethinkdb: %s", reql)
|
||||||
|
page_ = reql.run()
|
||||||
return flask.jsonify(page_)
|
return flask.jsonify(page_)
|
||||||
|
|
||||||
@app.route("/api/pages/<page_id>/yaml")
|
@app.route("/api/pages/<page_id>/yaml")
|
||||||
@app.route("/api/page/<page_id>/yaml")
|
@app.route("/api/page/<page_id>/yaml")
|
||||||
def page_yaml(page_id):
|
def page_yaml(page_id):
|
||||||
page_ = r.table("pages").get(page_id).run()
|
reql = r.table("pages").get(page_id)
|
||||||
|
logging.debug("querying rethinkdb: %s", reql)
|
||||||
|
page_ = reql.run()
|
||||||
return app.response_class(
|
return app.response_class(
|
||||||
yaml.dump(page_, default_flow_style=False),
|
yaml.dump(page_, default_flow_style=False),
|
||||||
mimetype='application/yaml')
|
mimetype="application/yaml")
|
||||||
|
|
||||||
@app.route("/api/sites/<site_id>")
|
@app.route("/api/sites/<site_id>")
|
||||||
@app.route("/api/site/<site_id>")
|
@app.route("/api/site/<site_id>")
|
||||||
def site(site_id):
|
def site(site_id):
|
||||||
s = r.table("sites").get(site_id).run()
|
reql = r.table("sites").get(site_id)
|
||||||
|
logging.debug("querying rethinkdb: %s", reql)
|
||||||
|
s = reql.run()
|
||||||
if "cookie_db" in s:
|
if "cookie_db" in s:
|
||||||
s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii")
|
s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii")
|
||||||
return flask.jsonify(s)
|
return flask.jsonify(s)
|
||||||
@ -133,20 +134,30 @@ def site(site_id):
|
|||||||
@app.route("/api/sites/<site_id>/yaml")
|
@app.route("/api/sites/<site_id>/yaml")
|
||||||
@app.route("/api/site/<site_id>/yaml")
|
@app.route("/api/site/<site_id>/yaml")
|
||||||
def site_yaml(site_id):
|
def site_yaml(site_id):
|
||||||
site_ = r.table("sites").get(site_id).run()
|
reql = r.table("sites").get(site_id)
|
||||||
|
logging.debug("querying rethinkdb: %s", reql)
|
||||||
|
site_ = reql.run()
|
||||||
return app.response_class(
|
return app.response_class(
|
||||||
yaml.dump(site_, default_flow_style=False),
|
yaml.dump(site_, default_flow_style=False),
|
||||||
mimetype='application/yaml')
|
mimetype="application/yaml")
|
||||||
|
|
||||||
@app.route("/api/stats/<bucket>")
|
@app.route("/api/stats/<bucket>")
|
||||||
def stats(bucket):
|
def stats(bucket):
|
||||||
stats_ = r.table("stats").get(bucket).run()
|
reql = r.table("stats").get(bucket)
|
||||||
|
logging.debug("querying rethinkdb: %s", reql)
|
||||||
|
stats_ = reql.run()
|
||||||
return flask.jsonify(stats_)
|
return flask.jsonify(stats_)
|
||||||
|
|
||||||
@app.route("/api/jobs/<int:job_id>/sites")
|
@app.route("/api/jobs/<job_id>/sites")
|
||||||
@app.route("/api/job/<int:job_id>/sites")
|
@app.route("/api/job/<job_id>/sites")
|
||||||
def sites(job_id):
|
def sites(job_id):
|
||||||
sites_ = list(r.table("sites").get_all(job_id, index="job_id").run())
|
try:
|
||||||
|
jid = int(job_id)
|
||||||
|
except ValueError:
|
||||||
|
jid = job_id
|
||||||
|
reql = r.table("sites").get_all(jid, index="job_id")
|
||||||
|
logging.debug("querying rethinkdb: %s", reql)
|
||||||
|
sites_ = list(reql.run())
|
||||||
# TypeError: <binary, 7168 bytes, '53 51 4c 69 74 65...'> is not JSON serializable
|
# TypeError: <binary, 7168 bytes, '53 51 4c 69 74 65...'> is not JSON serializable
|
||||||
for s in sites_:
|
for s in sites_:
|
||||||
if "cookie_db" in s:
|
if "cookie_db" in s:
|
||||||
@ -156,26 +167,40 @@ def sites(job_id):
|
|||||||
@app.route("/api/jobless-sites")
|
@app.route("/api/jobless-sites")
|
||||||
def jobless_sites():
|
def jobless_sites():
|
||||||
# XXX inefficient (unindexed) query
|
# XXX inefficient (unindexed) query
|
||||||
sites_ = list(r.table("sites").filter(~r.row.has_fields("job_id")).run())
|
reql = r.table("sites").filter(~r.row.has_fields("job_id"))
|
||||||
|
logging.debug("querying rethinkdb: %s", reql)
|
||||||
|
sites_ = list(reql.run())
|
||||||
# TypeError: <binary, 7168 bytes, '53 51 4c 69 74 65...'> is not JSON serializable
|
# TypeError: <binary, 7168 bytes, '53 51 4c 69 74 65...'> is not JSON serializable
|
||||||
for s in sites_:
|
for s in sites_:
|
||||||
if "cookie_db" in s:
|
if "cookie_db" in s:
|
||||||
s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii")
|
s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii")
|
||||||
return flask.jsonify(sites=sites_)
|
return flask.jsonify(sites=sites_)
|
||||||
|
|
||||||
@app.route("/api/jobs/<int:job_id>")
|
@app.route("/api/jobs/<job_id>")
|
||||||
@app.route("/api/job/<int:job_id>")
|
@app.route("/api/job/<job_id>")
|
||||||
def job(job_id):
|
def job(job_id):
|
||||||
job_ = r.table("jobs").get(job_id).run()
|
try:
|
||||||
|
jid = int(job_id)
|
||||||
|
except ValueError:
|
||||||
|
jid = job_id
|
||||||
|
reql = r.table("jobs").get(jid)
|
||||||
|
logging.debug("querying rethinkdb: %s", reql)
|
||||||
|
job_ = reql.run()
|
||||||
return flask.jsonify(job_)
|
return flask.jsonify(job_)
|
||||||
|
|
||||||
@app.route("/api/jobs/<int:job_id>/yaml")
|
@app.route("/api/jobs/<job_id>/yaml")
|
||||||
@app.route("/api/job/<int:job_id>/yaml")
|
@app.route("/api/job/<job_id>/yaml")
|
||||||
def job_yaml(job_id):
|
def job_yaml(job_id):
|
||||||
job_ = r.table("jobs").get(job_id).run()
|
try:
|
||||||
|
jid = int(job_id)
|
||||||
|
except ValueError:
|
||||||
|
jid = job_id
|
||||||
|
reql = r.table("jobs").get(jid)
|
||||||
|
logging.debug("querying rethinkdb: %s", reql)
|
||||||
|
job_ = reql.run()
|
||||||
return app.response_class(
|
return app.response_class(
|
||||||
yaml.dump(job_, default_flow_style=False),
|
yaml.dump(job_, default_flow_style=False),
|
||||||
mimetype='application/yaml')
|
mimetype="application/yaml")
|
||||||
|
|
||||||
@app.route("/api/workers")
|
@app.route("/api/workers")
|
||||||
def workers():
|
def workers():
|
||||||
@ -189,7 +214,9 @@ def services():
|
|||||||
|
|
||||||
@app.route("/api/jobs")
|
@app.route("/api/jobs")
|
||||||
def jobs():
|
def jobs():
|
||||||
jobs_ = list(r.table("jobs").order_by(rethinkdb.desc("id")).run())
|
reql = r.table("jobs").order_by(rethinkdb.desc("id"))
|
||||||
|
logging.debug("querying rethinkdb: %s", reql)
|
||||||
|
jobs_ = list(reql.run())
|
||||||
return flask.jsonify(jobs=jobs_)
|
return flask.jsonify(jobs=jobs_)
|
||||||
|
|
||||||
@app.route("/api/config")
|
@app.route("/api/config")
|
||||||
@ -209,6 +236,12 @@ def root(path):
|
|||||||
try:
|
try:
|
||||||
import gunicorn.app.base
|
import gunicorn.app.base
|
||||||
from gunicorn.six import iteritems
|
from gunicorn.six import iteritems
|
||||||
|
import gunicorn.glogging
|
||||||
|
|
||||||
|
class BypassGunicornLogging(gunicorn.glogging.Logger):
|
||||||
|
def setup(self, cfg):
|
||||||
|
self.error_log.handlers = logging.root.handlers
|
||||||
|
self.access_log.handlers = logging.root.handlers
|
||||||
|
|
||||||
class GunicornBrozzlerDashboard(gunicorn.app.base.BaseApplication):
|
class GunicornBrozzlerDashboard(gunicorn.app.base.BaseApplication):
|
||||||
def __init__(self, app, options=None):
|
def __init__(self, app, options=None):
|
||||||
@ -222,21 +255,24 @@ try:
|
|||||||
if key in self.cfg.settings and value is not None])
|
if key in self.cfg.settings and value is not None])
|
||||||
for key, value in iteritems(config):
|
for key, value in iteritems(config):
|
||||||
self.cfg.set(key.lower(), value)
|
self.cfg.set(key.lower(), value)
|
||||||
|
self.cfg.set("logger_class", BypassGunicornLogging)
|
||||||
|
self.cfg.set("accesslog", "dummy-value")
|
||||||
|
|
||||||
def load(self):
|
def load(self):
|
||||||
return self.application
|
return self.application
|
||||||
|
|
||||||
def run(**options):
|
def run(**options):
|
||||||
logging.info('running brozzler-dashboard using gunicorn')
|
logging.info("running brozzler-dashboard using gunicorn")
|
||||||
GunicornBrozzlerDashboard(app, options).run()
|
GunicornBrozzlerDashboard(app, options).run()
|
||||||
|
|
||||||
except ImportError:
|
except ImportError:
|
||||||
def run():
|
def run():
|
||||||
logging.info('running brozzler-dashboard using simple flask app.run')
|
logging.info("running brozzler-dashboard using simple flask app.run")
|
||||||
app.run()
|
app.run()
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
import argparse
|
import argparse
|
||||||
|
import brozzler.cli
|
||||||
arg_parser = argparse.ArgumentParser(
|
arg_parser = argparse.ArgumentParser(
|
||||||
prog=os.path.basename(sys.argv[0]),
|
prog=os.path.basename(sys.argv[0]),
|
||||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||||
@ -252,8 +288,10 @@ def main():
|
|||||||
' BROZZLER_RETHINKDB_DB rethinkdb database name '
|
' BROZZLER_RETHINKDB_DB rethinkdb database name '
|
||||||
'(default: brozzler)\n'
|
'(default: brozzler)\n'
|
||||||
' WAYBACK_BASEURL base url for constructing wayback '
|
' WAYBACK_BASEURL base url for constructing wayback '
|
||||||
'links (default http://localhost:8091/brozzler)'))
|
'links (default http://localhost:8880/brozzler)'))
|
||||||
|
brozzler.cli.add_common_options(arg_parser)
|
||||||
args = arg_parser.parse_args(args=sys.argv[1:])
|
args = arg_parser.parse_args(args=sys.argv[1:])
|
||||||
|
brozzler.cli.configure_logging(args)
|
||||||
run()
|
run()
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
@ -48,21 +48,14 @@ import socketserver
|
|||||||
|
|
||||||
def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
|
def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
|
||||||
arg_parser = argparse.ArgumentParser(
|
arg_parser = argparse.ArgumentParser(
|
||||||
prog=prog, formatter_class=argparse.ArgumentDefaultsHelpFormatter,
|
formatter_class=brozzler.cli.BetterArgumentDefaultsHelpFormatter,
|
||||||
description=(
|
prog=prog, description=(
|
||||||
'brozzler-easy - easy deployment of brozzler, with '
|
'brozzler-easy - easy deployment of brozzler, with '
|
||||||
'brozzler-worker, warcprox, pywb, and brozzler-dashboard all '
|
'brozzler-worker, warcprox, pywb, and brozzler-dashboard all '
|
||||||
'running in a single process'))
|
'running in a single process'))
|
||||||
|
|
||||||
# common args
|
# common args
|
||||||
arg_parser.add_argument(
|
brozzler.cli.add_rethinkdb_options(arg_parser)
|
||||||
'--rethinkdb-servers', dest='rethinkdb_servers',
|
|
||||||
default='localhost', help=(
|
|
||||||
'rethinkdb servers, e.g. '
|
|
||||||
'db0.foo.org,db0.foo.org:38015,db1.foo.org'))
|
|
||||||
arg_parser.add_argument(
|
|
||||||
'--rethinkdb-db', dest='rethinkdb_db', default='brozzler',
|
|
||||||
help='rethinkdb database name')
|
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'-d', '--warcs-dir', dest='warcs_dir', default='./warcs',
|
'-d', '--warcs-dir', dest='warcs_dir', default='./warcs',
|
||||||
help='where to write warcs')
|
help='where to write warcs')
|
||||||
@ -114,18 +107,7 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
|
|||||||
type=int, default=8881, help='brozzler dashboard port')
|
type=int, default=8881, help='brozzler dashboard port')
|
||||||
|
|
||||||
# common at the bottom args
|
# common at the bottom args
|
||||||
arg_parser.add_argument(
|
brozzler.cli.add_common_options(arg_parser)
|
||||||
'-v', '--verbose', dest='verbose', action='store_true',
|
|
||||||
help='verbose logging')
|
|
||||||
arg_parser.add_argument(
|
|
||||||
'-q', '--quiet', dest='quiet', action='store_true',
|
|
||||||
help='quiet logging (warnings and errors only)')
|
|
||||||
# arg_parser.add_argument(
|
|
||||||
# '-s', '--silent', dest='log_level', action='store_const',
|
|
||||||
# default=logging.INFO, const=logging.CRITICAL)
|
|
||||||
arg_parser.add_argument(
|
|
||||||
'--version', action='version',
|
|
||||||
version='brozzler %s - %s' % (brozzler.__version__, prog))
|
|
||||||
|
|
||||||
return arg_parser
|
return arg_parser
|
||||||
|
|
||||||
@ -284,17 +266,7 @@ class BrozzlerEasyController:
|
|||||||
def main():
|
def main():
|
||||||
arg_parser = _build_arg_parser()
|
arg_parser = _build_arg_parser()
|
||||||
args = arg_parser.parse_args(args=sys.argv[1:])
|
args = arg_parser.parse_args(args=sys.argv[1:])
|
||||||
if args.verbose:
|
brozzler.cli.configure_logging(args)
|
||||||
loglevel = logging.DEBUG
|
|
||||||
elif args.quiet:
|
|
||||||
loglevel = logging.WARNING
|
|
||||||
else:
|
|
||||||
loglevel = logging.INFO
|
|
||||||
|
|
||||||
logging.basicConfig(
|
|
||||||
level=loglevel, stream=sys.stderr, format=(
|
|
||||||
'%(asctime)s %(process)d %(levelname)s %(threadName)s '
|
|
||||||
'%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s'))
|
|
||||||
|
|
||||||
controller = BrozzlerEasyController(args)
|
controller = BrozzlerEasyController(args)
|
||||||
signal.signal(signal.SIGTERM, lambda a,b: controller.stop.set())
|
signal.signal(signal.SIGTERM, lambda a,b: controller.stop.set())
|
||||||
|
2
setup.py
2
setup.py
@ -32,7 +32,7 @@ def find_package_data(package):
|
|||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.1b9.dev171',
|
version='1.1b9.dev172',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
Loading…
x
Reference in New Issue
Block a user