diff --git a/.gitmodules b/.gitmodules index 88c87e5..961fbe1 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ -[submodule "webconsole/brozzler-webconsole/static/noVNC"] - path = webconsole/brozzler-webconsole/static/noVNC +[submodule "brozzler/webconsole/static/noVNC"] + path = brozzler/webconsole/static/noVNC url = https://github.com/kanaka/noVNC.git diff --git a/README.rst b/README.rst index c67e620..94f3262 100644 --- a/README.rst +++ b/README.rst @@ -20,6 +20,8 @@ archiving. Installation ------------ +Brozzler requires Python 3.4 or later. + :: # set up virtualenv if desired @@ -68,6 +70,26 @@ must be specified, everything else is optional. scope: surt: http://(org,example, +Brozzler Web Console +-------------------- + +Brozzler comes with a rudimentary web application for viewing crawl job status. +To install brozzler with the dependencies required to run this app, run + +:: + + pip install brozzler[webconsole] + + +To start the app, run + +:: + + brozzler-webconsole + + +TODO: document web console configuration options. + Fonts (for decent screenshots) ------------------------------ diff --git a/bin/brozzle-page b/bin/brozzle-page deleted file mode 100755 index 27e1a35..0000000 --- a/bin/brozzle-page +++ /dev/null @@ -1,79 +0,0 @@ -#!/usr/bin/env python -# -# brozzle-page - command line utility for brozzling a single page, i.e. opening -# it in a browser, running some javascript behaviors, and printing outlinks -# -# Copyright (C) 2014-2016 Internet Archive -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# - -import argparse -import os -import sys -import logging -import brozzler -import re -import warnings -import requests -import string -import datetime - -arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__), - description="brozzle-page - brozzle a single page", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) -arg_parser.add_argument('url', metavar='URL', help='page url') -arg_parser.add_argument('-e', '--executable', dest='chrome_exe', default='chromium-browser', - help='executable to use to invoke chrome') -arg_parser.add_argument("--proxy", dest="proxy", default=None, help="http proxy for this site") -arg_parser.add_argument('--enable-warcprox-features', dest='enable_warcprox_features', - action='store_true', help='enable special features for this site that assume the configured proxy is warcprox') -arg_parser.add_argument("-v", "--verbose", dest="log_level", - action="store_const", default=logging.INFO, const=logging.DEBUG) -arg_parser.add_argument("--version", action="version", - version="brozzler {} - {}".format(brozzler.__version__, os.path.basename(__file__))) -args = arg_parser.parse_args(args=sys.argv[1:]) - -logging.basicConfig(stream=sys.stdout, level=args.log_level, - format="%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s") -logging.getLogger("requests.packages.urllib3").setLevel(logging.WARN) -warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecureRequestWarning) -warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecurePlatformWarning) - -site = brozzler.Site( - id=-1, seed=args.url, proxy=args.proxy, - enable_warcprox_features=args.enable_warcprox_features) -page = brozzler.Page(url=args.url, site_id=site.id) -worker = brozzler.BrozzlerWorker(frontier=None) -ydl = worker._youtube_dl(site) - -def 
on_screenshot(screenshot_png): - OK_CHARS = (string.ascii_letters + string.digits) - filename = "/tmp/{}-{:%Y%m%d%H%M%S}.png".format( - "".join(ch if ch in OK_CHARS else "_" for ch in args.url), - datetime.datetime.now()) - # logging.info("len(screenshot_png)=%s", len(screenshot_png)) - with open(filename, 'wb') as f: - f.write(screenshot_png) - logging.info("wrote screenshot to %s", filename) - -browser = brozzler.Browser(chrome_exe=args.chrome_exe) -browser.start(proxy=site.proxy) -try: - outlinks = worker.brozzle_page( - browser, ydl, site, page, on_screenshot=on_screenshot) - logging.info("outlinks: \n\t%s", "\n\t".join(sorted(outlinks))) -except brozzler.ReachedLimit as e: - logging.error("reached limit %s", e) -finally: - browser.stop() diff --git a/bin/brozzler-new-job b/bin/brozzler-new-job deleted file mode 100755 index 4ddb65e..0000000 --- a/bin/brozzler-new-job +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env python -# -# brozzler-new-job - takes a yaml brozzler job configuration file, creates -# job, sites, and pages objects in rethinkdb, which brozzler-workers will look -# at and start crawling -# -# Copyright (C) 2014-2016 Internet Archive -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -import argparse -import os -import sys -import logging -import brozzler -import yaml -import json -import rethinkstuff -import warnings -import requests - -arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__), - description="brozzler-new-job - queue new job with brozzler", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) -arg_parser.add_argument('job_conf_file', metavar='JOB_CONF_FILE', help='brozzler job configuration file in yaml') -arg_parser.add_argument('--rethinkdb-servers', dest='rethinkdb_servers', default="localhost", - help='rethinkdb servers, e.g. db0.foo.org,db0.foo.org:38015,db1.foo.org') -arg_parser.add_argument('--rethinkdb-db', dest='rethinkdb_db', default="brozzler", - help='rethinkdb database name') -arg_parser.add_argument("-v", "--verbose", dest="log_level", - action="store_const", default=logging.INFO, const=logging.DEBUG) -arg_parser.add_argument("--version", action="version", - version="brozzler {} - {}".format(brozzler.__version__, os.path.basename(__file__))) -args = arg_parser.parse_args(args=sys.argv[1:]) - -logging.basicConfig(stream=sys.stdout, level=args.log_level, - format="%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s") -logging.getLogger("requests.packages.urllib3").setLevel(logging.WARN) -warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecureRequestWarning) -warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecurePlatformWarning) - -r = rethinkstuff.Rethinker(args.rethinkdb_servers.split(","), args.rethinkdb_db) -frontier = brozzler.RethinkDbFrontier(r) -brozzler.job.new_job_file(frontier, args.job_conf_file) - diff --git a/bin/brozzler-new-site b/bin/brozzler-new-site deleted file mode 100755 index 79742d6..0000000 --- a/bin/brozzler-new-site +++ /dev/null @@ -1,72 +0,0 @@ -#!/usr/bin/env python -# -# brozzler-new-site - takes a seed url and creates a site and page object in 
-# rethinkdb, which brozzler-workers will look at and start crawling -# -# Copyright (C) 2014-2016 Internet Archive -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -import argparse -import os -import sys -import logging -import brozzler -import re -import rethinkstuff -import warnings -import requests -import json - -arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__), - description="brozzler-new-site - register site to brozzle", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) -arg_parser.add_argument('seed', metavar='SEED', help='seed url') -arg_parser.add_argument('--rethinkdb-servers', dest='rethinkdb_servers', default="localhost", - help='rethinkdb servers, e.g. 
db0.foo.org,db0.foo.org:38015,db1.foo.org') -arg_parser.add_argument('--rethinkdb-db', dest='rethinkdb_db', default="brozzler", - help='rethinkdb database name') -arg_parser.add_argument("--proxy", dest="proxy", default=None, help="http proxy for this site") -arg_parser.add_argument("--time-limit", dest="time_limit", default=None, help="time limit in seconds for this site") -arg_parser.add_argument("--ignore-robots", dest="ignore_robots", - action="store_true", help="ignore robots.txt for this site") -arg_parser.add_argument('--enable-warcprox-features', dest='enable_warcprox_features', - action='store_true', help='enable special features for this site that assume the configured proxy is warcprox') -arg_parser.add_argument( - '--warcprox-meta', dest='warcprox_meta', - help='Warcprox-Meta http request header to send with each request; ' - 'must be a json blob, ignored unless warcprox features are enabled') -arg_parser.add_argument("-v", "--verbose", dest="log_level", - action="store_const", default=logging.INFO, const=logging.DEBUG) -arg_parser.add_argument("--version", action="version", - version="brozzler {} - {}".format(brozzler.__version__, os.path.basename(__file__))) -args = arg_parser.parse_args(args=sys.argv[1:]) - -logging.basicConfig(stream=sys.stdout, level=args.log_level, - format="%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s") -logging.getLogger("requests.packages.urllib3").setLevel(logging.WARN) -warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecureRequestWarning) -warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecurePlatformWarning) - -site = brozzler.Site( - seed=args.seed, proxy=args.proxy, - time_limit=int(args.time_limit) if args.time_limit else None, - ignore_robots=args.ignore_robots, - enable_warcprox_features=args.enable_warcprox_features, - warcprox_meta=json.loads(args.warcprox_meta) if args.warcprox_meta else 
None) - -r = rethinkstuff.Rethinker(args.rethinkdb_servers.split(","), args.rethinkdb_db) -frontier = brozzler.RethinkDbFrontier(r) -brozzler.new_site(frontier, site) - diff --git a/bin/brozzler-worker b/bin/brozzler-worker deleted file mode 100755 index 0db2bed..0000000 --- a/bin/brozzler-worker +++ /dev/null @@ -1,101 +0,0 @@ -#!/usr/bin/env python -# -# brozzler-worker - main entrypoint for brozzler, gets sites and pages to -# brozzle from rethinkdb, brozzles them -# -# Copyright (C) 2014-2016 Internet Archive -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - - -import argparse -import os -import sys -import logging -import brozzler -import brozzler.worker -import threading -import time -import signal -import pprint -import traceback -import rethinkstuff -import warnings -import requests - -arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__), - formatter_class=argparse.ArgumentDefaultsHelpFormatter) -arg_parser.add_argument('--rethinkdb-servers', dest='rethinkdb_servers', default="localhost", - help='rethinkdb servers, e.g. 
db0.foo.org,db0.foo.org:38015,db1.foo.org') -arg_parser.add_argument('--rethinkdb-db', dest='rethinkdb_db', default="brozzler", - help='rethinkdb database name') -arg_parser.add_argument('-e', '--executable', dest='chrome_exe', default='chromium-browser', - help='executable to use to invoke chrome') -arg_parser.add_argument('-n', '--max-browsers', dest='max_browsers', default='1', - help='max number of chrome instances simultaneously browsing pages') -arg_parser.add_argument('-v', '--verbose', dest='log_level', - action="store_const", default=logging.INFO, const=logging.DEBUG) -arg_parser.add_argument('--version', action='version', - version="brozzler {} - {}".format(brozzler.__version__, os.path.basename(__file__))) -args = arg_parser.parse_args(args=sys.argv[1:]) - -logging.basicConfig(stream=sys.stdout, level=args.log_level, - format='%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s') -logging.getLogger("requests.packages.urllib3").setLevel(logging.WARN) -warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecureRequestWarning) -warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecurePlatformWarning) - -def sigterm(signum, frame): - raise brozzler.ShutdownRequested('shutdown requested (caught SIGTERM)') -def sigint(signum, frame): - raise brozzler.ShutdownRequested('shutdown requested (caught SIGINT)') - -def dump_state(signum, frame): - pp = pprint.PrettyPrinter(indent=4) - state_strs = [] - - for th in threading.enumerate(): - state_strs.append(str(th)) - stack = traceback.format_stack(sys._current_frames()[th.ident]) - state_strs.append("".join(stack)) - - logging.warn("dumping state (caught signal {})\n{}".format( - signum, "\n".join(state_strs))) - -signal.signal(signal.SIGQUIT, dump_state) -signal.signal(signal.SIGTERM, sigterm) -signal.signal(signal.SIGINT, sigint) - -r = rethinkstuff.Rethinker(args.rethinkdb_servers.split(","), 
args.rethinkdb_db) -frontier = brozzler.RethinkDbFrontier(r) -service_registry = rethinkstuff.ServiceRegistry(r) -worker = brozzler.worker.BrozzlerWorker( - frontier, service_registry, max_browsers=int(args.max_browsers), - chrome_exe=args.chrome_exe) - -worker_thread = worker.start() - -try: - while worker_thread.is_alive(): - time.sleep(0.5) - logging.critical("worker thread has died, shutting down") -except brozzler.ShutdownRequested as e: - pass -finally: - worker.shutdown_now() - for th in threading.enumerate(): - if th != threading.current_thread(): - th.join() - -logging.info("all done, exiting") diff --git a/brozzler.svg b/brozzler.svg index a108910..8cec68a 120000 --- a/brozzler.svg +++ b/brozzler.svg @@ -1 +1 @@ -webconsole/brozzler-webconsole/static/brozzler.svg \ No newline at end of file +brozzler/webconsole/static/brozzler.svg \ No newline at end of file diff --git a/brozzler/browser.py b/brozzler/browser.py index 79b353f..02b7f3c 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -273,7 +273,7 @@ class Browser: or self._waiting_on_screenshot_msg_id): return False - if self._outlinks: + if self._outlinks is not None: self.logger.info("got outlinks, finished browsing %s", self.url) return True elif not self._waiting_on_outlinks_msg_id: @@ -290,7 +290,9 @@ var compileOutlinks = function(frame) { var outlinks = Array.prototype.slice.call( frame.document.querySelectorAll('a[href]')); for (var i = 0; i < frame.frames.length; i++) { - outlinks = outlinks.concat(compileOutlinks(frame.frames[i])); + if (frame.frames[i]) { // sometimes undefined (why?) 
+            outlinks = outlinks.concat(compileOutlinks(frame.frames[i]));
+        }
     }
     return outlinks;
 }
diff --git a/brozzler/cli.py b/brozzler/cli.py
new file mode 100644
index 0000000..b5264b4
--- /dev/null
+++ b/brozzler/cli.py
@@ -0,0 +1,267 @@
+#!/usr/bin/env python
+'''
+brozzler/cli.py - brozzler command line executables
+
+Copyright (C) 2014-2016 Internet Archive
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+import argparse
+import brozzler
+import brozzler.worker
+import datetime
+import json
+import logging
+import os
+import pprint
+import re
+import requests
+import rethinkstuff
+import signal
+import string
+import sys
+import threading
+import time
+import traceback
+import warnings
+import yaml
+
+def _add_common_options(arg_parser):
+    '''Adds the -v/--verbose and --version options to arg_parser.'''
+    arg_parser.add_argument(
+            '-v', '--verbose', dest='log_level',
+            action='store_const', default=logging.INFO, const=logging.DEBUG)
+    arg_parser.add_argument(
+            '--version', action='version',
+            version='brozzler %s - %s' % (
+                brozzler.__version__, os.path.basename(sys.argv[0])))
+
+def _add_rethinkdb_options(arg_parser):
+    '''Adds the --rethinkdb-servers and --rethinkdb-db options to arg_parser.'''
+    arg_parser.add_argument(
+            '--rethinkdb-servers', dest='rethinkdb_servers',
+            default='localhost', help=(
+                'rethinkdb servers, e.g. '
+                'db0.foo.org,db0.foo.org:38015,db1.foo.org'))
+    arg_parser.add_argument(
+            '--rethinkdb-db', dest='rethinkdb_db', default='brozzler',
+            help='rethinkdb database name')
+
+def _add_proxy_options(arg_parser):
+    '''Adds the --proxy and --enable-warcprox-features options to arg_parser.'''
+    arg_parser.add_argument(
+            '--proxy', dest='proxy', default=None, help='http proxy')
+    arg_parser.add_argument(
+            '--enable-warcprox-features', dest='enable_warcprox_features',
+            action='store_true', help=(
+                'enable special features that assume the configured proxy is '
+                'warcprox'))
+
+def _configure_logging(args):
+    '''
+    Sends log output to stderr at level args.log_level, raises the urllib3
+    logger's threshold to WARN, and ignores urllib3's Insecure*Warning.
+    '''
+    logging.basicConfig(
+            stream=sys.stderr, level=args.log_level,
+            format=(
+                '%(asctime)s %(process)d %(levelname)s %(threadName)s '
+                '%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s'))
+    logging.getLogger('requests.packages.urllib3').setLevel(logging.WARN)
+    warnings.simplefilter(
+            'ignore', category=requests.packages.urllib3.exceptions.InsecureRequestWarning)
+    warnings.simplefilter(
+            'ignore', category=requests.packages.urllib3.exceptions.InsecurePlatformWarning)
+
+def brozzle_page():
+    '''
+    Command line utility entry point for brozzling a single page. Opens url in
+    a browser, running some javascript behaviors, and prints outlinks.
+    '''
+    arg_parser = argparse.ArgumentParser(
+            prog=os.path.basename(sys.argv[0]),
+            description='brozzle-page - brozzle a single page',
+            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    arg_parser.add_argument('url', metavar='URL', help='page url')
+    arg_parser.add_argument(
+            '-e', '--executable', dest='chrome_exe', default='chromium-browser',
+            help='executable to use to invoke chrome')
+    _add_proxy_options(arg_parser)
+    _add_common_options(arg_parser)
+
+    args = arg_parser.parse_args(args=sys.argv[1:])
+    _configure_logging(args)
+
+    site = brozzler.Site(
+            id=-1, seed=args.url, proxy=args.proxy,
+            enable_warcprox_features=args.enable_warcprox_features)
+    page = brozzler.Page(url=args.url, site_id=site.id)
+    worker = brozzler.BrozzlerWorker(frontier=None)
+    ydl = worker._youtube_dl(site)
+
+    def on_screenshot(screenshot_png):
+        OK_CHARS = (string.ascii_letters + string.digits)
+        filename = '/tmp/{}-{:%Y%m%d%H%M%S}.png'.format(
+                ''.join(ch if ch in OK_CHARS else '_' for ch in args.url),
+                datetime.datetime.now())
+        # logging.info('len(screenshot_png)=%s', len(screenshot_png))
+        with open(filename, 'wb') as f:
+            f.write(screenshot_png)
+        logging.info('wrote screenshot to %s', filename)
+
+    browser = brozzler.Browser(chrome_exe=args.chrome_exe)
+    browser.start(proxy=site.proxy)
+    try:
+        outlinks = worker.brozzle_page(
+                browser, ydl, site, page, on_screenshot=on_screenshot)
+        logging.info('outlinks: \n\t%s', '\n\t'.join(sorted(outlinks)))
+    except brozzler.ReachedLimit as e:
+        logging.error('reached limit %s', e)
+    finally:
+        browser.stop()
+
+def brozzler_new_job():
+    '''
+    Command line utility entry point for queuing a new brozzler job. Takes a
+    yaml brozzler job configuration file, creates job, sites, and pages objects
+    in rethinkdb, which brozzler-workers will look at and start crawling.
+    '''
+    arg_parser = argparse.ArgumentParser(
+            prog=os.path.basename(sys.argv[0]),
+            description='brozzler-new-job - queue new job with brozzler',
+            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    arg_parser.add_argument(
+            'job_conf_file', metavar='JOB_CONF_FILE',
+            help='brozzler job configuration file in yaml')
+    _add_rethinkdb_options(arg_parser)
+    _add_common_options(arg_parser)
+
+    args = arg_parser.parse_args(args=sys.argv[1:])
+    _configure_logging(args)
+
+    r = rethinkstuff.Rethinker(
+            args.rethinkdb_servers.split(','), args.rethinkdb_db)
+    frontier = brozzler.RethinkDbFrontier(r)
+    brozzler.job.new_job_file(frontier, args.job_conf_file)
+
+
+def brozzler_new_site():
+    '''
+    Command line utility entry point for queuing a new brozzler site.
+    Takes a seed url and creates a site and page object in rethinkdb, which
+    brozzler-workers will look at and start crawling.
+    '''
+    arg_parser = argparse.ArgumentParser(
+            prog=os.path.basename(sys.argv[0]),
+            description='brozzler-new-site - register site to brozzle',
+            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    arg_parser.add_argument('seed', metavar='SEED', help='seed url')
+    _add_rethinkdb_options(arg_parser)
+    _add_proxy_options(arg_parser)
+    arg_parser.add_argument(
+            '--time-limit', dest='time_limit', default=None,
+            help='time limit in seconds for this site')
+    arg_parser.add_argument(
+            '--ignore-robots', dest='ignore_robots', action='store_true',
+            help='ignore robots.txt for this site')
+    arg_parser.add_argument(
+            '--warcprox-meta', dest='warcprox_meta',
+            help=(
+                'Warcprox-Meta http request header to send with each request; '
+                'must be a json blob, ignored unless warcprox features are '
+                'enabled'))
+    _add_common_options(arg_parser)
+
+    args = arg_parser.parse_args(args=sys.argv[1:])
+    _configure_logging(args)
+
+    site = brozzler.Site(
+            seed=args.seed, proxy=args.proxy,
+            time_limit=int(args.time_limit) if args.time_limit else None,
+            ignore_robots=args.ignore_robots,
+            enable_warcprox_features=args.enable_warcprox_features,
+            warcprox_meta=(
+                json.loads(args.warcprox_meta) if args.warcprox_meta else None))
+
+    r = rethinkstuff.Rethinker(
+            args.rethinkdb_servers.split(","), args.rethinkdb_db)
+    frontier = brozzler.RethinkDbFrontier(r)
+    brozzler.new_site(frontier, site)
+
+def brozzler_worker():
+    '''
+    Main entrypoint for brozzler, gets sites and pages to brozzle from
+    rethinkdb, brozzles them.
+    '''
+
+    arg_parser = argparse.ArgumentParser(
+            prog=os.path.basename(sys.argv[0]),
+            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    _add_rethinkdb_options(arg_parser)
+    arg_parser.add_argument(
+            '-e', '--executable', dest='chrome_exe', default='chromium-browser',
+            help='executable to use to invoke chrome')
+    arg_parser.add_argument(
+            '-n', '--max-browsers', dest='max_browsers', default='1',
+            help='max number of chrome instances simultaneously browsing pages')
+    _add_common_options(arg_parser)
+
+    args = arg_parser.parse_args(args=sys.argv[1:])
+    _configure_logging(args)
+
+    def sigterm(signum, frame):
+        raise brozzler.ShutdownRequested('shutdown requested (caught SIGTERM)')
+    def sigint(signum, frame):
+        raise brozzler.ShutdownRequested('shutdown requested (caught SIGINT)')
+
+    def dump_state(signum, frame):
+        '''Logs the stack trace of every live thread (SIGQUIT handler).'''
+        state_strs = []
+
+        for th in threading.enumerate():
+            state_strs.append(str(th))
+            stack = traceback.format_stack(sys._current_frames()[th.ident])
+            state_strs.append("".join(stack))
+
+        logging.warning("dumping state (caught signal {})\n{}".format(
+            signum, "\n".join(state_strs)))
+
+    signal.signal(signal.SIGQUIT, dump_state)
+    signal.signal(signal.SIGTERM, sigterm)
+    signal.signal(signal.SIGINT, sigint)
+
+    r = rethinkstuff.Rethinker(
+            args.rethinkdb_servers.split(","), args.rethinkdb_db)
+    frontier = brozzler.RethinkDbFrontier(r)
+    service_registry = rethinkstuff.ServiceRegistry(r)
+    worker = brozzler.worker.BrozzlerWorker(
+            frontier, service_registry, max_browsers=int(args.max_browsers),
+            chrome_exe=args.chrome_exe)
+
+    worker_thread = worker.start()
+
+    try:
+        while worker_thread.is_alive():
+            time.sleep(0.5)
+        logging.critical("worker thread has died, shutting down")
+    except brozzler.ShutdownRequested as e:
+        pass
+    finally:
+        worker.shutdown_now()
+        # wait for every other thread to exit before returning
+        for th in threading.enumerate():
+            if th != threading.current_thread():
+                th.join()
+
+    logging.info("brozzler-worker is all done, exiting")
diff
--git a/webconsole/brozzler-webconsole/__init__.py b/brozzler/webconsole/__init__.py
similarity index 80%
rename from webconsole/brozzler-webconsole/__init__.py
rename to brozzler/webconsole/__init__.py
index cc6d90b..db0f78b 100644
--- a/webconsole/brozzler-webconsole/__init__.py
+++ b/brozzler/webconsole/__init__.py
@@ -1,5 +1,5 @@
 '''
-brozzler-webconsole/__init__.py - flask app for brozzler web console, defines
+brozzler/webconsole/__init__.py - flask app for brozzler web console, defines
 api endspoints etc
 
 Copyright (C) 2014-2016 Internet Archive
@@ -17,14 +17,22 @@ See the License for the specific language governing permissions and
 limitations under the License.
 '''
 
-import flask
+import logging
+import sys
+try:
+    import flask
+except ImportError as e:
+    logging.critical(
+            '%s: %s\n\nYou might need to run "pip install '
+            'brozzler[webconsole]".\nSee README.rst for more information.',
+            type(e).__name__, e)
+    sys.exit(1)
+
 import rethinkstuff
 import json
-import sys
 import os
 import importlib
 import rethinkdb
-import logging
 import yaml
 
 # flask does its own logging config
@@ -157,6 +165,45 @@ def api404(path):
 def root(path):
     return flask.render_template("index.html")
 
-if __name__ == "__main__":
-    app.run(host="0.0.0.0", port=8081, debug=True)
+try:
+    import gunicorn.app.base
+
+    class GunicornBrozzlerWebConsole(gunicorn.app.base.BaseApplication):
+        '''Serves the flask app under gunicorn with the given settings.'''
+        def __init__(self, app, options=None):
+            self.options = options or {}
+            self.application = app
+            super(GunicornBrozzlerWebConsole, self).__init__()
+
+        def load_config(self):
+            # apply only settings gunicorn recognizes and that have a value
+            config = {
+                    key: value for key, value in self.options.items()
+                    if key in self.cfg.settings and value is not None}
+            for key, value in config.items():
+                self.cfg.set(key.lower(), value)
+
+        def load(self):
+            return self.application
+
+    def run(**options):
+        '''Runs the web console under gunicorn, configured by options.'''
+        logging.info('running brozzler-webconsole using gunicorn')
+        GunicornBrozzlerWebConsole(app, options).run()
+
+except ImportError:
+    def run(**options):
+        '''
+        Runs the web console with app.run(). Accepts the same keyword
+        arguments as the gunicorn variant, but cannot apply them.
+        '''
+        if options:
+            logging.warning(
+                    'not running under gunicorn, ignoring options %r', options)
+        logging.info('running brozzler-webconsole using simple flask app.run')
+        app.run()
+
+if __name__ == "__main__":
+    # arguments?
+    run()
 
diff --git a/webconsole/brozzler-webconsole/static/brozzler.svg b/brozzler/webconsole/static/brozzler.svg
similarity index 100%
rename from webconsole/brozzler-webconsole/static/brozzler.svg
rename to brozzler/webconsole/static/brozzler.svg
diff --git a/webconsole/brozzler-webconsole/static/js/app.js b/brozzler/webconsole/static/js/app.js
similarity index 94%
rename from webconsole/brozzler-webconsole/static/js/app.js
rename to brozzler/webconsole/static/js/app.js
index f74c3f9..ebd6a34 100644
--- a/webconsole/brozzler-webconsole/static/js/app.js
+++ b/brozzler/webconsole/static/js/app.js
@@ -127,8 +127,11 @@ function loadSiteStats($http, site, job) {
 
     // look at Warcprox-Meta to find stats bucket
     for (var j = 0; j < site.warcprox_meta.stats.buckets.length; j++) {
-        if (site.warcprox_meta.stats.buckets[j].indexOf("seed") >= 0) {
-            var bucket = site.warcprox_meta.stats.buckets[j];
+        var bucket = site.warcprox_meta.stats.buckets[j];
+        if (typeof(bucket) == "object") {
+            bucket = bucket["bucket"];
+        }
+        if (bucket.indexOf("seed") >= 0) {
             // console.log("warcprox_meta.stats.buckets[" + j + "]=" + bucket);
             $http.get("/api/stats/" + bucket).success(statsSuccessCallback(site, bucket));
         }
@@ -146,7 +149,11 @@ brozzlerControllers.controller("JobController", ["$scope", "$routeParams", "$htt
         $scope.job = data;
         $scope.job.page_count = $scope.job.queued_count = 0;
         // console.log("job=", $scope.job);
-        $http.get("/api/stats/" + $scope.job.conf.warcprox_meta.stats.buckets[0]).success(function(data) {
+        var bucket = $scope.job.conf.warcprox_meta.stats.buckets[0];
+        if (typeof(bucket) == "object") {
+            bucket = bucket["bucket"];
+        }
+        $http.get("/api/stats/" + bucket).success(function(data) {
             $scope.job.stats = data;
             // console.log("job stats=", $scope.job.stats);
         });
diff --git a/webconsole/brozzler-webconsole/static/partials/home.html
b/brozzler/webconsole/static/partials/home.html similarity index 100% rename from webconsole/brozzler-webconsole/static/partials/home.html rename to brozzler/webconsole/static/partials/home.html diff --git a/webconsole/brozzler-webconsole/static/partials/job.html b/brozzler/webconsole/static/partials/job.html similarity index 100% rename from webconsole/brozzler-webconsole/static/partials/job.html rename to brozzler/webconsole/static/partials/job.html diff --git a/webconsole/brozzler-webconsole/static/partials/site.html b/brozzler/webconsole/static/partials/site.html similarity index 100% rename from webconsole/brozzler-webconsole/static/partials/site.html rename to brozzler/webconsole/static/partials/site.html diff --git a/webconsole/brozzler-webconsole/static/partials/workers.html b/brozzler/webconsole/static/partials/workers.html similarity index 100% rename from webconsole/brozzler-webconsole/static/partials/workers.html rename to brozzler/webconsole/static/partials/workers.html diff --git a/webconsole/brozzler-webconsole/templates/index.html b/brozzler/webconsole/templates/index.html similarity index 100% rename from webconsole/brozzler-webconsole/templates/index.html rename to brozzler/webconsole/templates/index.html diff --git a/docker/Dockerfile b/docker/Dockerfile deleted file mode 100644 index a8ab831..0000000 --- a/docker/Dockerfile +++ /dev/null @@ -1,26 +0,0 @@ -FROM phusion/baseimage -MAINTAINER Noah Levitt - -ENV LANG=C.UTF-8 - -RUN apt-get update && apt-get --auto-remove -y dist-upgrade -RUN apt-get -y install vnc4server -RUN apt-get -y install chromium-browser -RUN apt-get -y install xfonts-base fonts-arphic-bkai00mp fonts-arphic-bsmi00lp fonts-arphic-gbsn00lp fonts-arphic-gkai00mp fonts-arphic-ukai fonts-farsiweb fonts-nafees fonts-sil-abyssinica fonts-sil-ezra fonts-sil-padauk fonts-unfonts-extra fonts-unfonts-core ttf-indic-fonts fonts-thai-tlwg fonts-lklug-sinhala -RUN apt-get -y install python3-pip git vlc -RUN apt-get -y install 
libjpeg-turbo8-dev zlib1g-dev -RUN pip3 install websockify - -RUN adduser --disabled-password --gecos="Charlie Brozzler" brozzler - -RUN mkdir -vp /etc/service/vncserver -ADD vncserver.sh /etc/service/vncserver/run - -RUN mkdir -vp /etc/service/vnc-websock -ADD vnc-websock.sh /etc/service/vnc-websock/run - -EXPOSE 5901 8901 -EXPOSE 8080 - -# RUN pip3 install -i http://crawl342.us.archive.org:9000/nlevitt/dev/+simple/ git+https://github.com/nlevitt/brozzler.git - diff --git a/docker/vnc-websock.sh b/docker/vnc-websock.sh deleted file mode 100755 index 4fae773..0000000 --- a/docker/vnc-websock.sh +++ /dev/null @@ -1,2 +0,0 @@ -#!/bin/sh -exec setuser brozzler websockify 0.0.0.0:8901 localhost:5901 diff --git a/docker/vncserver.sh b/docker/vncserver.sh deleted file mode 100755 index b980f6e..0000000 --- a/docker/vncserver.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -# https://github.com/phusion/baseimage-docker#adding-additional-daemons -# /usr/bin/vncserver backgrounds the Xvnc4 process, so we run Xvnc4 directly - -# password_file=/tmp/vnc-passwd -# /bin/echo -ne '\x95\x3f\x23\x7a\x76\x2a\x05\x89' > $password_file -# exec setuser brozzler Xvnc4 :1 -desktop brozzler@`hostname`:1 -auth /tmp/Xauthority.brozzler -geometry 1600x1000 -depth 24 -rfbwait 0 -nolisten tcp -rfbport 5901 -rfbauth $password_file -pn -fp /usr/share/fonts/X11/misc/ -co /etc/X11/rgb >> /tmp/`hostname`:1.log 2>&1 - -# exec setuser brozzler Xvnc4 :1 -desktop brozzler@`hostname`:1 -auth /tmp/Xauthority.brozzler -geometry 1600x1000 -depth 24 -rfbwait 0 -nolisten tcp -rfbport 5901 -SecurityTypes None -pn -fp /usr/share/fonts/X11/misc/ -co /etc/X11/rgb AcceptCutText=0 AcceptPointerEvents=0 AcceptKeyEvents=0 >> /tmp/`hostname`:1.log 2>&1 -exec setuser brozzler Xvnc4 :1 -desktop brozzler@`hostname`:1 -auth /tmp/Xauthority.brozzler -geometry 1600x1000 -depth 24 -rfbwait 0 -nolisten tcp -rfbport 5901 -SecurityTypes None -pn -fp /usr/share/fonts/X11/misc/ -co /etc/X11/rgb >> /tmp/`hostname`:1.log 2>&1 - 
diff --git a/no-docker/README.rst b/no-docker/README.rst deleted file mode 100644 index b863649..0000000 --- a/no-docker/README.rst +++ /dev/null @@ -1,7 +0,0 @@ -Chromium seemed to be dying more often when running in a docker container. - -To start the services brozzler-worker depends on: -/home/nlevitt/workspace/brozzler/no-docker/vncserver.sh & /home/nlevitt/workspace/brozzler/no-docker/vnc-websock.sh & - -Prerequisites: -apt-get -y install vnc4server chromium-browser xfonts-base fonts-arphic-bkai00mp fonts-arphic-bsmi00lp fonts-arphic-gbsn00lp fonts-arphic-gkai00mp fonts-arphic-ukai fonts-farsiweb fonts-nafees fonts-sil-abyssinica fonts-sil-ezra fonts-sil-padauk fonts-unfonts-extra fonts-unfonts-core ttf-indic-fonts fonts-thai-tlwg fonts-lklug-sinhala python3-pip git libjpeg-turbo8-dev zlib1g-dev diff --git a/no-docker/vnc-websock.sh b/no-docker/vnc-websock.sh deleted file mode 100755 index 903068d..0000000 --- a/no-docker/vnc-websock.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash -script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -PYTHONPATH=/home/nlevitt/workspace/websockify/websockify-ve34/lib/python3.4/site-packages:/home/nlevitt/workspace/websockify exec /home/nlevitt/workspace/websockify/websockify-ve34/bin/websockify 0.0.0.0:8901 localhost:5901 >> /home/nlevitt/workspace/brozzler/no-docker/websockify-`hostname -s`.out 2>&1 diff --git a/no-docker/vncserver.sh b/no-docker/vncserver.sh deleted file mode 100755 index ae31e98..0000000 --- a/no-docker/vncserver.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash -script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -exec Xvnc4 :1 -auth /tmp/Xauthority.$USER -geometry 1600x1000 -depth 24 -rfbwait 0 -nolisten tcp -rfbport 5901 -SecurityTypes None -pn -fp /usr/share/fonts/X11/misc/ -co /etc/X11/rgb AcceptCutText=0 AcceptPointerEvents=0 AcceptKeyEvents=0 >> $script_dir/Xvnc4-`hostname -s`:1.out 2>&1 - diff --git a/setup.py b/setup.py index 7f6255c..10ab003 100644 --- a/setup.py +++ b/setup.py @@ -1,27 
+1,27 @@ -# -# setup.py - brozzler setup script -# -# Copyright (C) 2014-2016 Internet Archive -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# +#!/usr/bin/env python +''' +setup.py - brozzler setup script + +Copyright (C) 2014-2016 Internet Archive + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+''' import setuptools -import glob setuptools.setup( name='brozzler', - version='1.1.dev20', + version='1.1.dev26', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', @@ -30,7 +30,15 @@ setuptools.setup( license='Apache License 2.0', packages=['brozzler'], package_data={'brozzler': ['behaviors.d/*.js*', 'behaviors.yaml']}, - scripts=glob.glob('bin/*'), + entry_points={ + 'console_scripts': [ + 'brozzle-page=brozzler.cli:brozzle_page', + 'brozzler-new-job=brozzler.cli:brozzler_new_job', + 'brozzler-new-site=brozzler.cli:brozzler_new_site', + 'brozzler-worker=brozzler.cli:brozzler_worker', + 'brozzler-webconsole=brozzler.webconsole:run', + ], + }, install_requires=[ 'PyYAML', 'youtube-dl', @@ -38,11 +46,15 @@ setuptools.setup( 'requests', 'websocket-client', 'pillow', - 'surt>=0.3b2', - 'rethinkstuff', + 'surt>=0.3.0', + 'rethinkstuff>=0.1.5', 'rethinkdb>=2.3,<2.4', 'psutil', ], + extras_require={ + 'webconsole': ['flask>=0.11', 'gunicorn'], + # 'brozzler-easy': ['warcprox', 'pywb'], + }, zip_safe=False, classifiers=[ 'Development Status :: 4 - Beta', diff --git a/webconsole/README.rst b/webconsole/README.rst deleted file mode 100644 index 659fbfd..0000000 --- a/webconsole/README.rst +++ /dev/null @@ -1 +0,0 @@ -gunicorn --bind=0.0.0.0:8081 brozzler-webconsole:app diff --git a/webconsole/brozzler-webconsole/static/noVNC b/webconsole/brozzler-webconsole/static/noVNC deleted file mode 160000 index 6a90803..0000000 --- a/webconsole/brozzler-webconsole/static/noVNC +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 6a90803feb124791960e3962e328aa3cfb729aeb diff --git a/webconsole/requirements.txt b/webconsole/requirements.txt deleted file mode 100644 index 1255527..0000000 --- a/webconsole/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -rethinkstuff>=0.1.5 -flask>=0.11 -gunicorn -PyYAML