diff --git a/.travis.yml b/.travis.yml index bbdbadf..a1d20bd 100644 --- a/.travis.yml +++ b/.travis.yml @@ -17,3 +17,5 @@ after_failure: notifications: slack: secure: KPPXSscXnmSEQ2NXBZFKrzDEYHg067Kv1WR7RTRUH8EIlSS9MHTyErRa7HkaRPmqOllj4vvPbplNU2ALnCfhP4cqW+MvF0xv3GuEGXQ7Om2sBvVUQ3w0JJ5rLq9ferAfGdSnQFeViqfDix5LA3fMNZGouUHQdUHq7iO8E9n9jntvkKO9Jff7Dyo0K5KvOZOJfM9KsqFZLlFO5zoNB6Y9jubIT7+Ulk3EDto/Kny34VPIyJIm7y0cHHlYLEq780AweY0EIwMyMg/VPSRrVAsbLSrilO0YRgsQpjPC9Ci/rAWNWooaOk0eA+bwv1uHQnGtH0z446XUMXr3UZ2QlD4DE/uoP2okkl8EtqvlmEyjV8eO86TqYFDRgKfYpvlK6hHtb7SAHX28QeXQjbKNc5f7KpKO5PtZqaoBRL7acLlKyS8xQGiRtonTPFSBTFR2A+s6dZmKO9dDboglptiHk4dvL1ZD4S8qLJn1JjTJqvIU6tpCY3BpNErn4n1MkDjN5nqdXf7Q9Vmui8vRetwnMf1oXcsKj9FEt2utNfDqFNXcFsN+Mnr9rhXQ1++gt/7Zo844OowiARcxqZTNy5LqSD01WgGCvNMy3Odf+FTQ8PcDOF+001+g8La1R99U0o9/hT/gy+WYk2prYneWru4pQHF/a6goZgkLTwkskcaPVpDJtDs= + secure: jopAXO8j3AkNWhF02GIzlkHJmqcCfrDEDPHcLHwxGB1vKrJqfMtcmV1+JXv7jGPwT8hBkkZItD1fTbsA1UMTtZCsadhqwrH9sh/BtJy4mf1jDDK0Hq4bPdbpB/mHKBfjD+ZedPZphCiwRQm94QdMOAsmCsj1BluFn+ySHuNAnwyXCNohut5a3aFBszOwBNgZMwBmu+weAUpMrDbr/dhqOtU0IaNvhTJ2Ykyex7Of86L05lBI8MiGtq/J73uDiDINWViBXqG5+/LKIVLvnjzCxZOnOVtSVorRNY0OsClfLJILuWOXk0/C3p+lBCyq5iatWweNqcqqpMifUSdVp4x8GnPyvl4O5YuIZW674mpGmH6UW10MqEnqxFQIcZpArir/zToK/cIKsUse20n8U5LUgOSWeNM1RIBvc4ckeDuthjwvyfmP0hrnNxrPFxRez2J2r6alWFABvD0H83a3hn56AtGXqV+9gt9d4J0+vnBJkXMidQaORBnyRkPlTROxqkoK8r0PME8xr6GwDWHpUN7/Ibo9gS/zpA7zpJUIsAsevVKOSaITZwKqbCMTI3uy/tJcnzRUrnq5wqhh8vXlWzIxEvTW8vuIapjSvDzhnJga85bIEmoauyMd13gR/vhqXQ3xUdN5LeyXAPn24b5e2GNSrhDOaAs30tXe+Z31njSeKPM= + diff --git a/brozzler/browser.py b/brozzler/browser.py index d31663a..b043c34 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -157,7 +157,10 @@ class WebsockReceiverThread(threading.Thread): brozzler.thread_raise(self.calling_thread, BrowsingException) def run(self): - self.websock.run_forever() + # ping_timeout is used as the timeout for the call to select.select() + # in addition to its documented purpose, and must have a value to avoid + # hangs in certain situations + self.websock.run_forever(ping_timeout=0.5) def _on_message(self, websock, message): try: diff --git a/brozzler/cli.py b/brozzler/cli.py index 8724e0f..987dea6 100644 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -2,7 +2,7 @@ ''' brozzler/cli.py - brozzler command line executables -Copyright (C) 2014-2016 Internet Archive +Copyright (C) 2014-2017 Internet Archive Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -38,16 +38,19 @@ import yaml import shutil import base64 -def _add_common_options(arg_parser): +def add_common_options(arg_parser): arg_parser.add_argument( - '-q', '--quiet', dest='log_level', - action='store_const', default=logging.INFO, const=logging.WARN) + '-q', '--quiet', dest='log_level', action='store_const', + default=logging.INFO, const=logging.WARN, help=( + 'quiet logging, only warnings and errors')) arg_parser.add_argument( - '-v', '--verbose', dest='log_level', - action='store_const', default=logging.INFO, const=logging.DEBUG) + '-v', '--verbose', dest='log_level', action='store_const', + default=logging.INFO, const=logging.DEBUG, help=( + 'verbose logging')) arg_parser.add_argument( - '--trace', dest='log_level', - action='store_const', default=logging.INFO, const=brozzler.TRACE) + '--trace', dest='log_level', action='store_const', + default=logging.INFO, const=brozzler.TRACE, help=( + 'very verbose logging')) # arg_parser.add_argument( # '-s', '--silent', dest='log_level', action='store_const', # default=logging.INFO, const=logging.CRITICAL) @@ -56,20 +59,23 @@ def _add_common_options(arg_parser): version='brozzler %s - %s' % ( brozzler.__version__, os.path.basename(sys.argv[0]))) -def _add_rethinkdb_options(arg_parser): +def add_rethinkdb_options(arg_parser): arg_parser.add_argument( - '--rethinkdb-servers', dest='rethinkdb_servers', help=( + '--rethinkdb-servers', dest='rethinkdb_servers', + default=os.environ.get('BROZZLER_RETHINKDB_SERVERS', 'localhost'), + help=( 'rethinkdb servers, e.g. ' - 'db0.foo.org,db0.foo.org:38015,db1.foo.org (takes precedence ' - 'over environment variable BROZZLER_RETHINKDB_SERVERS)')) + 'db0.foo.org,db0.foo.org:38015,db1.foo.org (default is the ' + 'value of environment variable BROZZLER_RETHINKDB_SERVERS)')) arg_parser.add_argument( - '--rethinkdb-db', dest='rethinkdb_db', help=( - 'rethinkdb database name (takes precedence over ' - 'environment variable BROZZLER_RETHINKDB_DB)')) + '--rethinkdb-db', dest='rethinkdb_db', + default=os.environ.get('BROZZLER_RETHINKDB_DB', 'brozzler'), + help=( + 'rethinkdb database name (default is the value of environment ' + 'variable BROZZLER_RETHINKDB_DB)')) def rethinker(args): - servers = args.rethinkdb_servers or os.environ.get( - 'BROZZLER_RETHINKDB_SERVERS') or 'localhost' + servers = args.rethinkdb_servers or 'localhost' db = args.rethinkdb_db or os.environ.get( 'BROZZLER_RETHINKDB_DB') or 'brozzler' return rethinkstuff.Rethinker(servers.split(','), db) @@ -83,7 +89,7 @@ def _add_proxy_options(arg_parser): 'enable special features that assume the configured proxy is ' 'warcprox')) -def _configure_logging(args): +def configure_logging(args): logging.basicConfig( stream=sys.stderr, level=args.log_level, format=( @@ -115,6 +121,18 @@ def suggest_default_chrome_exe(): return exe return 'chromium-browser' +class BetterArgumentDefaultsHelpFormatter( + argparse.ArgumentDefaultsHelpFormatter): + ''' + Like argparse.ArgumentDefaultsHelpFormatter but omits the default value + for arguments with action='store_const'. + ''' + def _get_help_string(self, action): + if isinstance(action, argparse._StoreConstAction): + return action.help + else: + return super()._get_help_string(action) + def brozzle_page(): ''' Command line utility entry point for brozzling a single page. Opens url in @@ -123,7 +141,7 @@ def brozzle_page(): arg_parser = argparse.ArgumentParser( prog=os.path.basename(sys.argv[0]), description='brozzle-page - brozzle a single page', - formatter_class=argparse.ArgumentDefaultsHelpFormatter) + formatter_class=BetterArgumentDefaultsHelpFormatter) arg_parser.add_argument('url', metavar='URL', help='page url') arg_parser.add_argument( '-e', '--chrome-exe', dest='chrome_exe', @@ -149,10 +167,10 @@ def brozzle_page(): action='store_true', help=( 'enable special features that assume the configured proxy ' 'is warcprox')) - _add_common_options(arg_parser) + add_common_options(arg_parser) args = arg_parser.parse_args(args=sys.argv[1:]) - _configure_logging(args) + configure_logging(args) behavior_parameters = {} if args.behavior_parameters: @@ -199,11 +217,11 @@ def brozzler_new_job(): arg_parser.add_argument( 'job_conf_file', metavar='JOB_CONF_FILE', help='brozzler job configuration file in yaml') - _add_rethinkdb_options(arg_parser) - _add_common_options(arg_parser) + add_rethinkdb_options(arg_parser) + add_common_options(arg_parser) args = arg_parser.parse_args(args=sys.argv[1:]) - _configure_logging(args) + configure_logging(args) r = rethinker(args) frontier = brozzler.RethinkDbFrontier(r) @@ -225,7 +243,7 @@ def brozzler_new_site(): description='brozzler-new-site - register site to brozzle', formatter_class=argparse.ArgumentDefaultsHelpFormatter) arg_parser.add_argument('seed', metavar='SEED', help='seed url') - _add_rethinkdb_options(arg_parser) + add_rethinkdb_options(arg_parser) _add_proxy_options(arg_parser) arg_parser.add_argument( '--time-limit', dest='time_limit', default=None, @@ -251,10 +269,10 @@ def brozzler_new_site(): arg_parser.add_argument( '--password', dest='password', default=None, help='use this password to try to log in if a login form is found') - _add_common_options(arg_parser) + add_common_options(arg_parser) args = arg_parser.parse_args(args=sys.argv[1:]) - _configure_logging(args) + configure_logging(args) site = brozzler.Site( seed=args.seed, proxy=args.proxy, @@ -279,7 +297,7 @@ def brozzler_worker(): arg_parser = argparse.ArgumentParser( prog=os.path.basename(__file__), formatter_class=argparse.ArgumentDefaultsHelpFormatter) - _add_rethinkdb_options(arg_parser) + add_rethinkdb_options(arg_parser) arg_parser.add_argument( '-e', '--chrome-exe', dest='chrome_exe', default=suggest_default_chrome_exe(), @@ -287,10 +305,10 @@ def brozzler_worker(): arg_parser.add_argument( '-n', '--max-browsers', dest='max_browsers', default='1', help='max number of chrome instances simultaneously browsing pages') - _add_common_options(arg_parser) + add_common_options(arg_parser) args = arg_parser.parse_args(args=sys.argv[1:]) - _configure_logging(args) + configure_logging(args) def sigterm(signum, frame): raise brozzler.ShutdownRequested('shutdown requested (caught SIGTERM)') @@ -344,11 +362,11 @@ def brozzler_ensure_tables(): arg_parser = argparse.ArgumentParser( prog=os.path.basename(sys.argv[0]), formatter_class=argparse.ArgumentDefaultsHelpFormatter) - _add_rethinkdb_options(arg_parser) - _add_common_options(arg_parser) + add_rethinkdb_options(arg_parser) + add_common_options(arg_parser) args = arg_parser.parse_args(args=sys.argv[1:]) - _configure_logging(args) + configure_logging(args) r = rethinker(args) @@ -374,11 +392,11 @@ def brozzler_list_jobs(): arg_parser.add_argument( '-a', '--all', dest='all', action='store_true', help=( 'list all jobs (by default, only active jobs are listed)')) - _add_rethinkdb_options(arg_parser) - _add_common_options(arg_parser) + add_rethinkdb_options(arg_parser) + add_common_options(arg_parser) args = arg_parser.parse_args(args=sys.argv[1:]) - _configure_logging(args) + configure_logging(args) r = rethinker(args) reql = r.table('jobs').order_by('id') @@ -403,11 +421,11 @@ def brozzler_list_sites(): group.add_argument( '--job', dest='job', metavar='JOB_ID', help=( 'list only sites for the supplied job')) - _add_rethinkdb_options(arg_parser) - _add_common_options(arg_parser) + add_rethinkdb_options(arg_parser) + add_common_options(arg_parser) args = arg_parser.parse_args(args=sys.argv[1:]) - _configure_logging(args) + configure_logging(args) r = rethinker(args) @@ -449,11 +467,11 @@ def brozzler_list_pages(): '--claimed', dest='claimed', action='store_true', help=( 'limit only pages that are currently claimed by a brozzler ' 'worker')) - _add_rethinkdb_options(arg_parser) - _add_common_options(arg_parser) + add_rethinkdb_options(arg_parser) + add_common_options(arg_parser) args = arg_parser.parse_args(args=sys.argv[1:]) - _configure_logging(args) + configure_logging(args) r = rethinker(args) if args.job: @@ -508,14 +526,14 @@ def brozzler_list_captures(): 'use prefix match for url (n.b. may not work as expected if ' 'searching key has query string because canonicalization can ' 'reorder query parameters)')) - _add_rethinkdb_options(arg_parser) - _add_common_options(arg_parser) + add_rethinkdb_options(arg_parser) + add_common_options(arg_parser) arg_parser.add_argument( 'url_or_sha1', metavar='URL_or_SHA1', help='url or sha1 to look up in captures table') args = arg_parser.parse_args(args=sys.argv[1:]) - _configure_logging(args) + configure_logging(args) r = rethinker(args) diff --git a/brozzler/dashboard/__init__.py b/brozzler/dashboard/__init__.py index a1adc9b..977c0e3 100644 --- a/brozzler/dashboard/__init__.py +++ b/brozzler/dashboard/__init__.py @@ -2,7 +2,7 @@ brozzler/dashboard/__init__.py - flask app for brozzler dashboard, defines api endspoints etc -Copyright (C) 2014-2016 Internet Archive +Copyright (C) 2014-2017 Internet Archive Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -35,27 +35,15 @@ import rethinkdb import yaml import base64 -# flask does its own logging config -# logging.basicConfig( -# stream=sys.stdout, level=logging.INFO, -# format=( -# "%(asctime)s %(process)d %(levelname)s %(threadName)s " -# "%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s") - app = flask.Flask(__name__) -# http://stackoverflow.com/questions/26578733/why-is-flask-application-not-creating-any-logs-when-hosted-by-gunicorn -gunicorn_error_logger = logging.getLogger('gunicorn.error') -app.logger.handlers.extend(gunicorn_error_logger.handlers) -app.logger.setLevel(logging.INFO) - # configure with environment variables SETTINGS = { 'RETHINKDB_SERVERS': os.environ.get( 'BROZZLER_RETHINKDB_SERVERS', 'localhost').split(','), 'RETHINKDB_DB': os.environ.get('BROZZLER_RETHINKDB_DB', 'brozzler'), 'WAYBACK_BASEURL': os.environ.get( - 'WAYBACK_BASEURL', 'http://localhost:8091/brozzler'), + 'WAYBACK_BASEURL', 'http://localhost:8880/brozzler'), } r = rethinkstuff.Rethinker( SETTINGS['RETHINKDB_SERVERS'], db=SETTINGS['RETHINKDB_DB']) @@ -69,20 +57,24 @@ def service_registry(): @app.route("/api/sites//queued_count") @app.route("/api/site//queued_count") def queued_count(site_id): - count = r.table("pages").between( + reql = r.table("pages").between( [site_id, 0, False, r.minval], [site_id, 0, False, r.maxval], - index="priority_by_site").count().run() + index="priority_by_site").count() + logging.debug("querying rethinkdb: %s", reql) + count = reql.run() return flask.jsonify(count=count) @app.route("/api/sites//queue") @app.route("/api/site//queue") def queue(site_id): - app.logger.info("flask.request.args=%s", flask.request.args) + logging.debug("flask.request.args=%s", flask.request.args) start = flask.request.args.get("start", 0) end = flask.request.args.get("end", start + 90) - queue_ = r.table("pages").between( + reql = r.table("pages").between( [site_id, 0, False, r.minval], [site_id, 0, False, r.maxval], - index="priority_by_site")[start:end].run() + index="priority_by_site")[start:end] + logging.debug("querying rethinkdb: %s", reql) + queue_ = reql.run() return flask.jsonify(queue_=list(queue_)) @app.route("/api/sites//pages_count") @@ -90,42 +82,51 @@ def queue(site_id): @app.route("/api/sites//page_count") @app.route("/api/site//page_count") def page_count(site_id): - count = r.table("pages").between( + reql = r.table("pages").between( [site_id, 1, False, r.minval], [site_id, r.maxval, False, r.maxval], - index="priority_by_site").count().run() + index="priority_by_site").count() + logging.debug("querying rethinkdb: %s", reql) + count = reql.run() return flask.jsonify(count=count) @app.route("/api/sites//pages") @app.route("/api/site//pages") def pages(site_id): """Pages already crawled.""" - app.logger.info("flask.request.args=%s", flask.request.args) start = int(flask.request.args.get("start", 0)) end = int(flask.request.args.get("end", start + 90)) - pages_ = r.table("pages").between( + reql = r.table("pages").between( [site_id, 1, r.minval], [site_id, r.maxval, r.maxval], - index="least_hops").order_by(index="least_hops")[start:end].run() + index="least_hops").order_by(index="least_hops")[start:end] + logging.debug("querying rethinkdb: %s", reql) + pages_ = reql.run() return flask.jsonify(pages=list(pages_)) @app.route("/api/pages/") @app.route("/api/page/") def page(page_id): - page_ = r.table("pages").get(page_id).run() + reql = r.table("pages").get(page_id) + logging.debug("querying rethinkdb: %s", reql) + page_ = reql.run() return flask.jsonify(page_) @app.route("/api/pages//yaml") @app.route("/api/page//yaml") def page_yaml(page_id): - page_ = r.table("pages").get(page_id).run() + reql = r.table("pages").get(page_id) + logging.debug("querying rethinkdb: %s", reql) + page_ = reql.run() return app.response_class( yaml.dump(page_, default_flow_style=False), - mimetype='application/yaml') + mimetype="application/yaml") @app.route("/api/sites/") @app.route("/api/site/") def site(site_id): - s = r.table("sites").get(site_id).run() + reql = r.table("sites").get(site_id) + logging.debug("querying rethinkdb: %s", reql) + s = reql.run() if "cookie_db" in s: s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii") return flask.jsonify(s) @@ -133,20 +134,30 @@ def site(site_id): @app.route("/api/sites//yaml") @app.route("/api/site//yaml") def site_yaml(site_id): - site_ = r.table("sites").get(site_id).run() + reql = r.table("sites").get(site_id) + logging.debug("querying rethinkdb: %s", reql) + site_ = reql.run() return app.response_class( yaml.dump(site_, default_flow_style=False), - mimetype='application/yaml') + mimetype="application/yaml") @app.route("/api/stats/") def stats(bucket): - stats_ = r.table("stats").get(bucket).run() + reql = r.table("stats").get(bucket) + logging.debug("querying rethinkdb: %s", reql) + stats_ = reql.run() return flask.jsonify(stats_) -@app.route("/api/jobs//sites") -@app.route("/api/job//sites") +@app.route("/api/jobs//sites") +@app.route("/api/job//sites") def sites(job_id): - sites_ = list(r.table("sites").get_all(job_id, index="job_id").run()) + try: + jid = int(job_id) + except ValueError: + jid = job_id + reql = r.table("sites").get_all(jid, index="job_id") + logging.debug("querying rethinkdb: %s", reql) + sites_ = list(reql.run()) # TypeError: is not JSON serializable for s in sites_: if "cookie_db" in s: @@ -156,26 +167,40 @@ def sites(job_id): @app.route("/api/jobless-sites") def jobless_sites(): # XXX inefficient (unindexed) query - sites_ = list(r.table("sites").filter(~r.row.has_fields("job_id")).run()) + reql = r.table("sites").filter(~r.row.has_fields("job_id")) + logging.debug("querying rethinkdb: %s", reql) + sites_ = list(reql.run()) # TypeError: is not JSON serializable for s in sites_: if "cookie_db" in s: s["cookie_db"] = base64.b64encode(s["cookie_db"]).decode("ascii") return flask.jsonify(sites=sites_) -@app.route("/api/jobs/") -@app.route("/api/job/") +@app.route("/api/jobs/") +@app.route("/api/job/") def job(job_id): - job_ = r.table("jobs").get(job_id).run() + try: + jid = int(job_id) + except ValueError: + jid = job_id + reql = r.table("jobs").get(jid) + logging.debug("querying rethinkdb: %s", reql) + job_ = reql.run() return flask.jsonify(job_) -@app.route("/api/jobs//yaml") -@app.route("/api/job//yaml") +@app.route("/api/jobs//yaml") +@app.route("/api/job//yaml") def job_yaml(job_id): - job_ = r.table("jobs").get(job_id).run() + try: + jid = int(job_id) + except ValueError: + jid = job_id + reql = r.table("jobs").get(jid) + logging.debug("querying rethinkdb: %s", reql) + job_ = reql.run() return app.response_class( yaml.dump(job_, default_flow_style=False), - mimetype='application/yaml') + mimetype="application/yaml") @app.route("/api/workers") def workers(): @@ -189,7 +214,9 @@ def services(): @app.route("/api/jobs") def jobs(): - jobs_ = list(r.table("jobs").order_by(rethinkdb.desc("id")).run()) + reql = r.table("jobs").order_by(rethinkdb.desc("id")) + logging.debug("querying rethinkdb: %s", reql) + jobs_ = list(reql.run()) return flask.jsonify(jobs=jobs_) @app.route("/api/config") @@ -209,6 +236,12 @@ def root(path): try: import gunicorn.app.base from gunicorn.six import iteritems + import gunicorn.glogging + + class BypassGunicornLogging(gunicorn.glogging.Logger): + def setup(self, cfg): + self.error_log.handlers = logging.root.handlers + self.access_log.handlers = logging.root.handlers class GunicornBrozzlerDashboard(gunicorn.app.base.BaseApplication): def __init__(self, app, options=None): @@ -222,21 +255,24 @@ try: if key in self.cfg.settings and value is not None]) for key, value in iteritems(config): self.cfg.set(key.lower(), value) + self.cfg.set("logger_class", BypassGunicornLogging) + self.cfg.set("accesslog", "dummy-value") def load(self): return self.application def run(**options): - logging.info('running brozzler-dashboard using gunicorn') + logging.info("running brozzler-dashboard using gunicorn") GunicornBrozzlerDashboard(app, options).run() except ImportError: def run(): - logging.info('running brozzler-dashboard using simple flask app.run') + logging.info("running brozzler-dashboard using simple flask app.run") app.run() def main(): import argparse + import brozzler.cli arg_parser = argparse.ArgumentParser( prog=os.path.basename(sys.argv[0]), formatter_class=argparse.RawDescriptionHelpFormatter, @@ -252,8 +288,10 @@ def main(): ' BROZZLER_RETHINKDB_DB rethinkdb database name ' '(default: brozzler)\n' ' WAYBACK_BASEURL base url for constructing wayback ' - 'links (default http://localhost:8091/brozzler)')) + 'links (default http://localhost:8880/brozzler)')) + brozzler.cli.add_common_options(arg_parser) args = arg_parser.parse_args(args=sys.argv[1:]) + brozzler.cli.configure_logging(args) run() if __name__ == "__main__": diff --git a/brozzler/dashboard/static/js/app.js b/brozzler/dashboard/static/js/app.js index f9d8795..6274bba 100644 --- a/brozzler/dashboard/static/js/app.js +++ b/brozzler/dashboard/static/js/app.js @@ -1,7 +1,7 @@ /* * brozzler/dashboard/static/js/app.js - brozzler dashboard angularjs code * - * Copyright (C) 2014-2016 Internet Archive + * Copyright (C) 2014-2017 Internet Archive * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -96,16 +96,12 @@ brozzlerControllers.controller("WorkersListController", ["$scope", "$http", function statsSuccessCallback(site, bucket) { return function(data) { - // console.log("site = ", site); - // console.log("/api/stats/" + bucket + " = ", data); site.stats = data; } } function pageCountSuccessCallback(site, job) { return function(data) { - // console.log("site = ", site); - // console.log("/api/sites/" + site.id + "/page_count = ", data); site.page_count = data.count; if (job) { job.page_count += data.count; @@ -115,8 +111,6 @@ function pageCountSuccessCallback(site, job) { function queuedCountSuccessCallback(site, job) { return function(data) { - // console.log("site = ", site); - // console.log("/api/sites/" + site.id + "/queued_count = ", data); site.queued_count = data.count; if (job) { job.queued_count += data.count; @@ -129,41 +123,44 @@ function loadSiteStats($http, site, job) { $http.get("/api/sites/" + site.id + "/queued_count").success(queuedCountSuccessCallback(site, job)); // look at Warcprox-Meta to find stats bucket - for (var j = 0; j < site.warcprox_meta.stats.buckets.length; j++) { - var bucket = site.warcprox_meta.stats.buckets[j]; - if (typeof(bucket) == "object") { - bucket = bucket["bucket"]; - } - if (bucket.indexOf("seed") >= 0) { - // console.log("warcprox_meta.stats.buckets[" + j + "]=" + bucket); - $http.get("/api/stats/" + bucket).success(statsSuccessCallback(site, bucket)); + try { + for (var j = 0; j < site.warcprox_meta.stats.buckets.length; j++) { + var bucket = site.warcprox_meta.stats.buckets[j]; + if (typeof(bucket) == "object") { + bucket = bucket["bucket"]; + } + if (bucket.indexOf("seed") >= 0) { + $http.get("/api/stats/" + bucket).success(statsSuccessCallback(site, bucket)); + } } + } catch (e) { + // no stats bucket for this site } } brozzlerControllers.controller("JobController", ["$scope", "$routeParams", "$http", function($scope, $routeParams, $http) { $scope.show_yaml = false; - // console.log('JobController'); $http.get("/api/config").success(function(data) { $scope.config = data.config; }); $http.get("/api/jobs/" + $routeParams.id).success(function(data) { $scope.job = data; $scope.job.page_count = $scope.job.queued_count = 0; - // console.log("job=", $scope.job); - var bucket = $scope.job.conf.warcprox_meta.stats.buckets[0]; - if (typeof(bucket) == "object") { - bucket = bucket["bucket"]; + try { + var bucket = $scope.job.conf.warcprox_meta.stats.buckets[0]; + if (typeof(bucket) == "object") { + bucket = bucket["bucket"]; + } + $http.get("/api/stats/" + bucket).success(function(data) { + $scope.job.stats = data; + }); + } catch (e) { + // no stats bucket for this job } - $http.get("/api/stats/" + bucket).success(function(data) { - $scope.job.stats = data; - // console.log("job stats=", $scope.job.stats); - }); $http.get("/api/jobs/" + $routeParams.id + "/sites").success(function(data) { $scope.sites = data.sites; - // console.log("sites=", $scope.sites); for (var i = 0; i < $scope.sites.length; i++) { loadSiteStats($http, $scope.sites[i], $scope.job); } @@ -180,7 +177,6 @@ brozzlerControllers.controller("SiteController", ["$scope", "$routeParams", "$ht $scope.loading = false; $scope.pages = []; $window.addEventListener("scroll", function() { - // console.log("window.scrollTop=" + window.scrollTop + " window.offsetHeight=" + window.offsetHeight + " window.scrollHeight=" + window.scrollHeight); if ($window.innerHeight + $window.scrollY + 50 >= window.document.documentElement.scrollHeight) { loadMorePages(); } @@ -191,10 +187,8 @@ brozzlerControllers.controller("SiteController", ["$scope", "$routeParams", "$ht return; $scope.loading = true; - // console.log("load more! start=" + start); $http.get("/api/site/" + $routeParams.id + "/pages?start=" + start + "&end=" + (start+90)).then(function(response) { $scope.pages = $scope.pages.concat(response.data.pages); - // console.log("pages = ", $scope.pages); start += response.data.pages.length; $scope.loading = false; }, function(reason) { @@ -209,7 +203,6 @@ brozzlerControllers.controller("SiteController", ["$scope", "$routeParams", "$ht $http.get("/api/site/" + $routeParams.id).success(function(data) { $scope.site = data; loadSiteStats($http, $scope.site); - // console.log("site = ", $scope.site); }); $http.get("/api/site/" + $routeParams.id + "/yaml").success(function(data) { $scope.site_yaml = data; diff --git a/brozzler/easy.py b/brozzler/easy.py index 17ae268..899c15a 100644 --- a/brozzler/easy.py +++ b/brozzler/easy.py @@ -48,21 +48,14 @@ import socketserver def _build_arg_parser(prog=os.path.basename(sys.argv[0])): arg_parser = argparse.ArgumentParser( - prog=prog, formatter_class=argparse.ArgumentDefaultsHelpFormatter, - description=( + formatter_class=brozzler.cli.BetterArgumentDefaultsHelpFormatter, + prog=prog, description=( 'brozzler-easy - easy deployment of brozzler, with ' 'brozzler-worker, warcprox, pywb, and brozzler-dashboard all ' 'running in a single process')) # common args - arg_parser.add_argument( - '--rethinkdb-servers', dest='rethinkdb_servers', - default='localhost', help=( - 'rethinkdb servers, e.g. ' - 'db0.foo.org,db0.foo.org:38015,db1.foo.org')) - arg_parser.add_argument( - '--rethinkdb-db', dest='rethinkdb_db', default='brozzler', - help='rethinkdb database name') + brozzler.cli.add_rethinkdb_options(arg_parser) arg_parser.add_argument( '-d', '--warcs-dir', dest='warcs_dir', default='./warcs', help='where to write warcs') @@ -114,18 +107,7 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])): type=int, default=8881, help='brozzler dashboard port') # common at the bottom args - arg_parser.add_argument( - '-v', '--verbose', dest='verbose', action='store_true', - help='verbose logging') - arg_parser.add_argument( - '-q', '--quiet', dest='quiet', action='store_true', - help='quiet logging (warnings and errors only)') - # arg_parser.add_argument( - # '-s', '--silent', dest='log_level', action='store_const', - # default=logging.INFO, const=logging.CRITICAL) - arg_parser.add_argument( - '--version', action='version', - version='brozzler %s - %s' % (brozzler.__version__, prog)) + brozzler.cli.add_common_options(arg_parser) return arg_parser @@ -284,17 +266,7 @@ class BrozzlerEasyController: def main(): arg_parser = _build_arg_parser() args = arg_parser.parse_args(args=sys.argv[1:]) - if args.verbose: - loglevel = logging.DEBUG - elif args.quiet: - loglevel = logging.WARNING - else: - loglevel = logging.INFO - - logging.basicConfig( - level=loglevel, stream=sys.stderr, format=( - '%(asctime)s %(process)d %(levelname)s %(threadName)s ' - '%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')) + brozzler.cli.configure_logging(args) controller = BrozzlerEasyController(args) signal.signal(signal.SIGTERM, lambda a,b: controller.stop.set()) diff --git a/setup.py b/setup.py index 9b57433..a71986f 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ ''' setup.py - brozzler setup script -Copyright (C) 2014-2016 Internet Archive +Copyright (C) 2014-2017 Internet Archive Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b9.dev169', + version='1.1b9.dev174', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', diff --git a/tests/test_brozzling.py b/tests/test_brozzling.py index 5d66db9..a5a1e86 100644 --- a/tests/test_brozzling.py +++ b/tests/test_brozzling.py @@ -29,7 +29,7 @@ import json args = argparse.Namespace() args.log_level = logging.INFO -brozzler.cli._configure_logging(args) +brozzler.cli.configure_logging(args) WARCPROX_META_420 = { 'stats': {