Merge branch 'master' into qa

* master:
  handle "undefined" in list of frames when extracting outlinks (fixes ARI-4988)
  avoid hanging in case a page has no outlinks
  fix noVNC submodule path since brozzler webconsole has moved
  handle new bucket format in brozzler-webconsole
  fix brozzler.svg symlink
  convert command-line executables to entry_points console_scripts, best practice according to Python Packaging Authority (eases testing, etc)
  make brozzler-webconsole a part of the main brozzler package, using optional "extras_require" dependencies
  remove crufty docker and no-docker scripts
  note python 3.4 requirement in readme
This commit is contained in:
Noah Levitt 2016-06-28 12:26:38 -05:00
commit 6674c96bc6
27 changed files with 383 additions and 404 deletions

4
.gitmodules vendored
View File

@ -1,3 +1,3 @@
[submodule "webconsole/brozzler-webconsole/static/noVNC"]
path = webconsole/brozzler-webconsole/static/noVNC
[submodule "brozzler/webconsole/static/noVNC"]
path = brozzler/webconsole/static/noVNC
url = https://github.com/kanaka/noVNC.git

View File

@ -20,6 +20,8 @@ archiving.
Installation
------------
Brozzler requires python 3.4 or later.
::
# set up virtualenv if desired
@ -68,6 +70,26 @@ must be specified, everything else is optional.
scope:
surt: http://(org,example,
Brozzler Web Console
--------------------
Brozzler comes with a rudimentary web application for viewing crawl job status.
To install brozzler with the dependencies required to run this app, run
::
pip install brozzler[webconsole]
To start the app, run
::
brozzler-webconsole
XXX configuration stuff
Fonts (for decent screenshots)
------------------------------

View File

@ -1,79 +0,0 @@
#!/usr/bin/env python
#
# brozzle-page - command line utility for brozzling a single page, i.e. opening
# it in a browser, running some javascript behaviors, and printing outlinks
#
# Copyright (C) 2014-2016 Internet Archive
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import argparse
import os
import sys
import logging
import brozzler
import re
import warnings
import requests
import string
import datetime
arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__),
description="brozzle-page - brozzle a single page",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
arg_parser.add_argument('url', metavar='URL', help='page url')
arg_parser.add_argument('-e', '--executable', dest='chrome_exe', default='chromium-browser',
help='executable to use to invoke chrome')
arg_parser.add_argument("--proxy", dest="proxy", default=None, help="http proxy for this site")
arg_parser.add_argument('--enable-warcprox-features', dest='enable_warcprox_features',
action='store_true', help='enable special features for this site that assume the configured proxy is warcprox')
arg_parser.add_argument("-v", "--verbose", dest="log_level",
action="store_const", default=logging.INFO, const=logging.DEBUG)
arg_parser.add_argument("--version", action="version",
version="brozzler {} - {}".format(brozzler.__version__, os.path.basename(__file__)))
args = arg_parser.parse_args(args=sys.argv[1:])
logging.basicConfig(stream=sys.stdout, level=args.log_level,
format="%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s")
logging.getLogger("requests.packages.urllib3").setLevel(logging.WARN)
warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecureRequestWarning)
warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecurePlatformWarning)
site = brozzler.Site(
id=-1, seed=args.url, proxy=args.proxy,
enable_warcprox_features=args.enable_warcprox_features)
page = brozzler.Page(url=args.url, site_id=site.id)
worker = brozzler.BrozzlerWorker(frontier=None)
ydl = worker._youtube_dl(site)
def on_screenshot(screenshot_png):
OK_CHARS = (string.ascii_letters + string.digits)
filename = "/tmp/{}-{:%Y%m%d%H%M%S}.png".format(
"".join(ch if ch in OK_CHARS else "_" for ch in args.url),
datetime.datetime.now())
# logging.info("len(screenshot_png)=%s", len(screenshot_png))
with open(filename, 'wb') as f:
f.write(screenshot_png)
logging.info("wrote screenshot to %s", filename)
browser = brozzler.Browser(chrome_exe=args.chrome_exe)
browser.start(proxy=site.proxy)
try:
outlinks = worker.brozzle_page(
browser, ydl, site, page, on_screenshot=on_screenshot)
logging.info("outlinks: \n\t%s", "\n\t".join(sorted(outlinks)))
except brozzler.ReachedLimit as e:
logging.error("reached limit %s", e)
finally:
browser.stop()

View File

@ -1,56 +0,0 @@
#!/usr/bin/env python
#
# brozzler-new-job - takes a yaml brozzler job configuration file, creates
# job, sites, and pages objects in rethinkdb, which brozzler-workers will look
# at and start crawling
#
# Copyright (C) 2014-2016 Internet Archive
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import argparse
import os
import sys
import logging
import brozzler
import yaml
import json
import rethinkstuff
import warnings
import requests
arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__),
description="brozzler-new-job - queue new job with brozzler",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
arg_parser.add_argument('job_conf_file', metavar='JOB_CONF_FILE', help='brozzler job configuration file in yaml')
arg_parser.add_argument('--rethinkdb-servers', dest='rethinkdb_servers', default="localhost",
help='rethinkdb servers, e.g. db0.foo.org,db0.foo.org:38015,db1.foo.org')
arg_parser.add_argument('--rethinkdb-db', dest='rethinkdb_db', default="brozzler",
help='rethinkdb database name')
arg_parser.add_argument("-v", "--verbose", dest="log_level",
action="store_const", default=logging.INFO, const=logging.DEBUG)
arg_parser.add_argument("--version", action="version",
version="brozzler {} - {}".format(brozzler.__version__, os.path.basename(__file__)))
args = arg_parser.parse_args(args=sys.argv[1:])
logging.basicConfig(stream=sys.stdout, level=args.log_level,
format="%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s")
logging.getLogger("requests.packages.urllib3").setLevel(logging.WARN)
warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecureRequestWarning)
warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecurePlatformWarning)
r = rethinkstuff.Rethinker(args.rethinkdb_servers.split(","), args.rethinkdb_db)
frontier = brozzler.RethinkDbFrontier(r)
brozzler.job.new_job_file(frontier, args.job_conf_file)

View File

@ -1,72 +0,0 @@
#!/usr/bin/env python
#
# brozzler-new-site - takes a seed url and creates a site and page object in
# rethinkdb, which brozzler-workers will look at and start crawling
#
# Copyright (C) 2014-2016 Internet Archive
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import argparse
import os
import sys
import logging
import brozzler
import re
import rethinkstuff
import warnings
import requests
import json
arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__),
description="brozzler-new-site - register site to brozzle",
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
arg_parser.add_argument('seed', metavar='SEED', help='seed url')
arg_parser.add_argument('--rethinkdb-servers', dest='rethinkdb_servers', default="localhost",
help='rethinkdb servers, e.g. db0.foo.org,db0.foo.org:38015,db1.foo.org')
arg_parser.add_argument('--rethinkdb-db', dest='rethinkdb_db', default="brozzler",
help='rethinkdb database name')
arg_parser.add_argument("--proxy", dest="proxy", default=None, help="http proxy for this site")
arg_parser.add_argument("--time-limit", dest="time_limit", default=None, help="time limit in seconds for this site")
arg_parser.add_argument("--ignore-robots", dest="ignore_robots",
action="store_true", help="ignore robots.txt for this site")
arg_parser.add_argument('--enable-warcprox-features', dest='enable_warcprox_features',
action='store_true', help='enable special features for this site that assume the configured proxy is warcprox')
arg_parser.add_argument(
'--warcprox-meta', dest='warcprox_meta',
help='Warcprox-Meta http request header to send with each request; '
'must be a json blob, ignored unless warcprox features are enabled')
arg_parser.add_argument("-v", "--verbose", dest="log_level",
action="store_const", default=logging.INFO, const=logging.DEBUG)
arg_parser.add_argument("--version", action="version",
version="brozzler {} - {}".format(brozzler.__version__, os.path.basename(__file__)))
args = arg_parser.parse_args(args=sys.argv[1:])
logging.basicConfig(stream=sys.stdout, level=args.log_level,
format="%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s")
logging.getLogger("requests.packages.urllib3").setLevel(logging.WARN)
warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecureRequestWarning)
warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecurePlatformWarning)
site = brozzler.Site(
seed=args.seed, proxy=args.proxy,
time_limit=int(args.time_limit) if args.time_limit else None,
ignore_robots=args.ignore_robots,
enable_warcprox_features=args.enable_warcprox_features,
warcprox_meta=json.loads(args.warcprox_meta) if args.warcprox_meta else None)
r = rethinkstuff.Rethinker(args.rethinkdb_servers.split(","), args.rethinkdb_db)
frontier = brozzler.RethinkDbFrontier(r)
brozzler.new_site(frontier, site)

View File

@ -1,101 +0,0 @@
#!/usr/bin/env python
#
# brozzler-worker - main entrypoint for brozzler, gets sites and pages to
# brozzle from rethinkdb, brozzles them
#
# Copyright (C) 2014-2016 Internet Archive
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import argparse
import os
import sys
import logging
import brozzler
import brozzler.worker
import threading
import time
import signal
import pprint
import traceback
import rethinkstuff
import warnings
import requests
arg_parser = argparse.ArgumentParser(prog=os.path.basename(__file__),
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
arg_parser.add_argument('--rethinkdb-servers', dest='rethinkdb_servers', default="localhost",
help='rethinkdb servers, e.g. db0.foo.org,db0.foo.org:38015,db1.foo.org')
arg_parser.add_argument('--rethinkdb-db', dest='rethinkdb_db', default="brozzler",
help='rethinkdb database name')
arg_parser.add_argument('-e', '--executable', dest='chrome_exe', default='chromium-browser',
help='executable to use to invoke chrome')
arg_parser.add_argument('-n', '--max-browsers', dest='max_browsers', default='1',
help='max number of chrome instances simultaneously browsing pages')
arg_parser.add_argument('-v', '--verbose', dest='log_level',
action="store_const", default=logging.INFO, const=logging.DEBUG)
arg_parser.add_argument('--version', action='version',
version="brozzler {} - {}".format(brozzler.__version__, os.path.basename(__file__)))
args = arg_parser.parse_args(args=sys.argv[1:])
logging.basicConfig(stream=sys.stdout, level=args.log_level,
format='%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')
logging.getLogger("requests.packages.urllib3").setLevel(logging.WARN)
warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecureRequestWarning)
warnings.simplefilter("ignore", category=requests.packages.urllib3.exceptions.InsecurePlatformWarning)
def sigterm(signum, frame):
raise brozzler.ShutdownRequested('shutdown requested (caught SIGTERM)')
def sigint(signum, frame):
raise brozzler.ShutdownRequested('shutdown requested (caught SIGINT)')
def dump_state(signum, frame):
pp = pprint.PrettyPrinter(indent=4)
state_strs = []
for th in threading.enumerate():
state_strs.append(str(th))
stack = traceback.format_stack(sys._current_frames()[th.ident])
state_strs.append("".join(stack))
logging.warn("dumping state (caught signal {})\n{}".format(
signum, "\n".join(state_strs)))
signal.signal(signal.SIGQUIT, dump_state)
signal.signal(signal.SIGTERM, sigterm)
signal.signal(signal.SIGINT, sigint)
r = rethinkstuff.Rethinker(args.rethinkdb_servers.split(","), args.rethinkdb_db)
frontier = brozzler.RethinkDbFrontier(r)
service_registry = rethinkstuff.ServiceRegistry(r)
worker = brozzler.worker.BrozzlerWorker(
frontier, service_registry, max_browsers=int(args.max_browsers),
chrome_exe=args.chrome_exe)
worker_thread = worker.start()
try:
while worker_thread.is_alive():
time.sleep(0.5)
logging.critical("worker thread has died, shutting down")
except brozzler.ShutdownRequested as e:
pass
finally:
worker.shutdown_now()
for th in threading.enumerate():
if th != threading.current_thread():
th.join()
logging.info("all done, exiting")

View File

@ -1 +1 @@
webconsole/brozzler-webconsole/static/brozzler.svg
brozzler/webconsole/static/brozzler.svg

View File

@ -273,7 +273,7 @@ class Browser:
or self._waiting_on_screenshot_msg_id):
return False
if self._outlinks:
if self._outlinks is not None:
self.logger.info("got outlinks, finished browsing %s", self.url)
return True
elif not self._waiting_on_outlinks_msg_id:
@ -290,7 +290,9 @@ var compileOutlinks = function(frame) {
var outlinks = Array.prototype.slice.call(
frame.document.querySelectorAll('a[href]'));
for (var i = 0; i < frame.frames.length; i++) {
outlinks = outlinks.concat(compileOutlinks(frame.frames[i]));
if (frame.frames[i]) { // sometimes undefined (why?)
outlinks = outlinks.concat(compileOutlinks(frame.frames[i]));
}
}
return outlinks;
}

266
brozzler/cli.py Normal file
View File

@ -0,0 +1,266 @@
#!/usr/bin/env python
'''
brozzler/cli.py - brozzler command line executables
Copyright (C) 2014-2016 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
import argparse
import brozzler
import brozzler.worker
import datetime
import json
import logging
import os
import pprint
import re
import requests
import rethinkstuff
import signal
import string
import sys
import threading
import time
import traceback
import warnings
import yaml
def _add_common_options(arg_parser):
    '''
    Adds the options shared by all the brozzler command line tools to
    `arg_parser`: -v/--verbose, which stores logging.DEBUG instead of the
    default logging.INFO in `log_level`, and --version.
    '''
    arg_parser.add_argument(
            '-v', '--verbose', dest='log_level', action='store_const',
            const=logging.DEBUG, default=logging.INFO)

    # the version banner names the program as it was invoked
    program_name = os.path.basename(sys.argv[0])
    version_banner = 'brozzler %s - %s' % (brozzler.__version__, program_name)
    arg_parser.add_argument(
            '--version', action='version', version=version_banner)
def _add_rethinkdb_options(arg_parser):
    '''
    Adds the rethinkdb connection options to `arg_parser`:
    --rethinkdb-servers (comma-separated list, default "localhost") and
    --rethinkdb-db (default "brozzler").
    '''
    servers_help = (
            'rethinkdb servers, e.g. '
            'db0.foo.org,db0.foo.org:38015,db1.foo.org')
    arg_parser.add_argument(
            '--rethinkdb-servers', dest='rethinkdb_servers',
            default='localhost', help=servers_help)

    db_help = 'rethinkdb database name'
    arg_parser.add_argument(
            '--rethinkdb-db', dest='rethinkdb_db', default='brozzler',
            help=db_help)
def _add_proxy_options(arg_parser):
    '''
    Adds the proxy related options to `arg_parser`: --proxy (default None)
    and the boolean flag --enable-warcprox-features.
    '''
    warcprox_help = (
            'enable special features that assume the configured proxy is '
            'warcprox')
    arg_parser.add_argument(
            '--proxy', dest='proxy', help='http proxy', default=None)
    arg_parser.add_argument(
            '--enable-warcprox-features', action='store_true',
            dest='enable_warcprox_features', help=warcprox_help)
def _configure_logging(args):
    '''
    Sets up logging for a command line tool: log records go to stderr at
    the level in `args.log_level`, the urllib3 logger bundled with requests
    is raised to WARN, and urllib3's InsecureRequestWarning and
    InsecurePlatformWarning are ignored.
    '''
    log_format = (
            '%(asctime)s %(process)d %(levelname)s %(threadName)s '
            '%(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')
    logging.basicConfig(
            stream=sys.stderr, level=args.log_level, format=log_format)

    urllib3_logger = logging.getLogger('requests.packages.urllib3')
    urllib3_logger.setLevel(logging.WARN)

    urllib3_exceptions = requests.packages.urllib3.exceptions
    for category_name in (
            'InsecureRequestWarning', 'InsecurePlatformWarning'):
        warnings.simplefilter(
                'ignore', category=getattr(urllib3_exceptions, category_name))
def brozzle_page():
    '''
    Command line utility entry point for brozzling a single page. Opens url in
    a browser, running some javascript behaviors, and logs the sorted
    outlinks.

    The `on_screenshot` callback handed to the worker writes each screenshot
    it receives to a timestamped png file under /tmp. The browser is always
    stopped on the way out, whether or not brozzling succeeded.
    '''
    arg_parser = argparse.ArgumentParser(
            prog=os.path.basename(sys.argv[0]),
            description='brozzle-page - brozzle a single page',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    arg_parser.add_argument('url', metavar='URL', help='page url')
    arg_parser.add_argument(
            '-e', '--executable', dest='chrome_exe', default='chromium-browser',
            help='executable to use to invoke chrome')
    # --proxy and --enable-warcprox-features, defined once for all the tools
    _add_proxy_options(arg_parser)
    _add_common_options(arg_parser)

    args = arg_parser.parse_args(args=sys.argv[1:])
    _configure_logging(args)

    # id=-1: this throwaway site is never stored, frontier is None below
    site = brozzler.Site(
            id=-1, seed=args.url, proxy=args.proxy,
            enable_warcprox_features=args.enable_warcprox_features)
    page = brozzler.Page(url=args.url, site_id=site.id)
    worker = brozzler.BrozzlerWorker(frontier=None)
    ydl = worker._youtube_dl(site)

    def on_screenshot(screenshot_png):
        # build a filesystem-safe name from the url: anything that is not an
        # ascii letter or digit becomes "_"
        OK_CHARS = (string.ascii_letters + string.digits)
        filename = '/tmp/{}-{:%Y%m%d%H%M%S}.png'.format(
                ''.join(ch if ch in OK_CHARS else '_' for ch in args.url),
                datetime.datetime.now())
        with open(filename, 'wb') as f:
            f.write(screenshot_png)
        logging.info('wrote screenshot to %s', filename)

    browser = brozzler.Browser(chrome_exe=args.chrome_exe)
    browser.start(proxy=site.proxy)
    try:
        outlinks = worker.brozzle_page(
                browser, ydl, site, page, on_screenshot=on_screenshot)
        logging.info('outlinks: \n\t%s', '\n\t'.join(sorted(outlinks)))
    except brozzler.ReachedLimit as e:
        logging.error('reached limit %s', e)
    finally:
        browser.stop()
def brozzler_new_job():
    '''
    Command line utility entry point for queuing a new brozzler job. Reads
    the yaml job configuration file named on the command line and hands it,
    together with a rethinkdb-backed frontier, to
    brozzler.job.new_job_file(), so that brozzler-workers can pick up the
    job and start crawling.
    '''
    parser = argparse.ArgumentParser(
            prog=os.path.basename(sys.argv[0]),
            description='brozzler-new-job - queue new job with brozzler',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
            'job_conf_file', metavar='JOB_CONF_FILE',
            help='brozzler job configuration file in yaml')
    _add_rethinkdb_options(parser)
    _add_common_options(parser)

    args = parser.parse_args(args=sys.argv[1:])
    _configure_logging(args)

    # --rethinkdb-servers is a comma-separated list of servers
    servers = args.rethinkdb_servers.split(',')
    rethinker = rethinkstuff.Rethinker(servers, args.rethinkdb_db)
    frontier = brozzler.RethinkDbFrontier(rethinker)
    brozzler.job.new_job_file(frontier, args.job_conf_file)
def brozzler_new_site():
    '''
    Command line utility entry point for queuing a new brozzler site.
    Builds a brozzler.Site from the seed url and options given on the
    command line and registers it through brozzler.new_site() with a
    rethinkdb-backed frontier, so that brozzler-workers can pick it up and
    start crawling.
    '''
    parser = argparse.ArgumentParser(
            prog=os.path.basename(sys.argv[0]),
            description='brozzler-new-site - register site to brozzle',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('seed', metavar='SEED', help='seed url')
    _add_rethinkdb_options(parser)
    _add_proxy_options(parser)
    parser.add_argument(
            '--time-limit', dest='time_limit', default=None,
            help='time limit in seconds for this site')
    parser.add_argument(
            '--ignore-robots', dest='ignore_robots', action='store_true',
            help='ignore robots.txt for this site')
    warcprox_meta_help = (
            'Warcprox-Meta http request header to send with each request; '
            'must be a json blob, ignored unless warcprox features are '
            'enabled')
    parser.add_argument(
            '--warcprox-meta', dest='warcprox_meta', help=warcprox_meta_help)
    _add_common_options(parser)

    args = parser.parse_args(args=sys.argv[1:])
    _configure_logging(args)

    # both options arrive as strings; absent or empty means "not set"
    if args.time_limit:
        time_limit = int(args.time_limit)
    else:
        time_limit = None
    if args.warcprox_meta:
        warcprox_meta = json.loads(args.warcprox_meta)
    else:
        warcprox_meta = None

    site = brozzler.Site(
            seed=args.seed, proxy=args.proxy, time_limit=time_limit,
            ignore_robots=args.ignore_robots,
            enable_warcprox_features=args.enable_warcprox_features,
            warcprox_meta=warcprox_meta)

    servers = args.rethinkdb_servers.split(",")
    rethinker = rethinkstuff.Rethinker(servers, args.rethinkdb_db)
    frontier = brozzler.RethinkDbFrontier(rethinker)
    brozzler.new_site(frontier, site)
def brozzler_worker():
    '''
    Main entrypoint for brozzler, gets sites and pages to brozzle from
    rethinkdb, brozzles them.

    Installs signal handlers: SIGTERM and SIGINT raise
    brozzler.ShutdownRequested, SIGQUIT logs the stack of every running
    thread. Then starts a BrozzlerWorker and polls its thread until it dies
    or shutdown is requested; either way the worker is told to shut down and
    all other threads are joined before returning.
    '''
    arg_parser = argparse.ArgumentParser(
            # sys.argv[0], like the other entry points; __file__ would show
            # up as "cli.py" in usage messages
            prog=os.path.basename(sys.argv[0]),
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    _add_rethinkdb_options(arg_parser)
    arg_parser.add_argument(
            '-e', '--executable', dest='chrome_exe', default='chromium-browser',
            help='executable to use to invoke chrome')
    arg_parser.add_argument(
            '-n', '--max-browsers', dest='max_browsers', default='1',
            help='max number of chrome instances simultaneously browsing pages')
    _add_common_options(arg_parser)

    args = arg_parser.parse_args(args=sys.argv[1:])
    _configure_logging(args)

    def sigterm(signum, frame):
        raise brozzler.ShutdownRequested('shutdown requested (caught SIGTERM)')

    def sigint(signum, frame):
        raise brozzler.ShutdownRequested('shutdown requested (caught SIGINT)')

    def dump_state(signum, frame):
        state_strs = []
        # one snapshot for all threads, rather than a new one per thread
        frames = sys._current_frames()
        for th in threading.enumerate():
            state_strs.append(str(th))
            th_frame = frames.get(th.ident)
            if th_frame is not None:
                # a thread can be missing from the snapshot (e.g. it exited
                # in between); don't let that blow up the signal handler
                state_strs.append("".join(traceback.format_stack(th_frame)))
        logging.warning("dumping state (caught signal {})\n{}".format(
            signum, "\n".join(state_strs)))

    signal.signal(signal.SIGQUIT, dump_state)
    signal.signal(signal.SIGTERM, sigterm)
    signal.signal(signal.SIGINT, sigint)

    r = rethinkstuff.Rethinker(
            args.rethinkdb_servers.split(","), args.rethinkdb_db)
    frontier = brozzler.RethinkDbFrontier(r)
    service_registry = rethinkstuff.ServiceRegistry(r)
    worker = brozzler.worker.BrozzlerWorker(
            frontier, service_registry, max_browsers=int(args.max_browsers),
            chrome_exe=args.chrome_exe)

    worker_thread = worker.start()
    try:
        while worker_thread.is_alive():
            time.sleep(0.5)
        logging.critical("worker thread has died, shutting down")
    except brozzler.ShutdownRequested:
        # raised by the SIGTERM/SIGINT handlers above; this is the normal
        # way out, cleanup happens in the finally clause
        pass
    finally:
        worker.shutdown_now()

    for th in threading.enumerate():
        if th != threading.current_thread():
            th.join()

    logging.info("brozzler-worker is all done, exiting")

View File

@ -1,5 +1,5 @@
'''
brozzler-webconsole/__init__.py - flask app for brozzler web console, defines
brozzler/webconsole/__init__.py - flask app for brozzler web console, defines
api endpoints etc
Copyright (C) 2014-2016 Internet Archive
@ -17,14 +17,22 @@ See the License for the specific language governing permissions and
limitations under the License.
'''
import flask
import logging
import sys
try:
import flask
except ImportError as e:
logging.critical(
'%s: %s\n\nYou might need to run "pip install '
'brozzler[webconsole]".\nSee README.rst for more information.',
type(e).__name__, e)
sys.exit(1)
import rethinkstuff
import json
import sys
import os
import importlib
import rethinkdb
import logging
import yaml
# flask does its own logging config
@ -157,6 +165,36 @@ def api404(path):
def root(path):
return flask.render_template("index.html")
if __name__ == "__main__":
app.run(host="0.0.0.0", port=8081, debug=True)
try:
    import gunicorn.app.base

    class GunicornBrozzlerWebConsole(gunicorn.app.base.BaseApplication):
        '''
        Gunicorn application that serves the flask `app` passed to the
        constructor, configured from a dict of gunicorn settings.
        '''
        def __init__(self, app, options=None):
            # set these before calling the base class constructor, so that
            # load_config() and load() can use them
            self.options = options or {}
            self.application = app
            super(GunicornBrozzlerWebConsole, self).__init__()

        def load_config(self):
            # pass through only settings gunicorn knows about, and only
            # those that were actually given a value
            config = {
                    key: value for key, value in self.options.items()
                    if key in self.cfg.settings and value is not None}
            for key, value in config.items():
                self.cfg.set(key.lower(), value)

        def load(self):
            return self.application

    def run(**options):
        '''
        Runs brozzler-webconsole under gunicorn; `options` are gunicorn
        settings, e.g. bind='0.0.0.0:8081'.
        '''
        logging.info('running brozzler-webconsole using gunicorn')
        GunicornBrozzlerWebConsole(app, options).run()

except ImportError:
    def run(**options):
        '''
        Fallback when gunicorn is not installed: runs brozzler-webconsole
        with flask's built-in server. `options` is accepted so that both
        variants of run() can be called the same way, but is ignored here.
        '''
        logging.info('running brozzler-webconsole using simple flask app.run')
        app.run()
if __name__ == "__main__":
# arguments?
run()

View File

Before

Width:  |  Height:  |  Size: 9.1 KiB

After

Width:  |  Height:  |  Size: 9.1 KiB

View File

@ -127,8 +127,11 @@ function loadSiteStats($http, site, job) {
// look at Warcprox-Meta to find stats bucket
for (var j = 0; j < site.warcprox_meta.stats.buckets.length; j++) {
if (site.warcprox_meta.stats.buckets[j].indexOf("seed") >= 0) {
var bucket = site.warcprox_meta.stats.buckets[j];
var bucket = site.warcprox_meta.stats.buckets[j];
if (typeof(bucket) == "object") {
bucket = bucket["bucket"];
}
if (bucket.indexOf("seed") >= 0) {
// console.log("warcprox_meta.stats.buckets[" + j + "]=" + bucket);
$http.get("/api/stats/" + bucket).success(statsSuccessCallback(site, bucket));
}
@ -146,7 +149,11 @@ brozzlerControllers.controller("JobController", ["$scope", "$routeParams", "$htt
$scope.job = data;
$scope.job.page_count = $scope.job.queued_count = 0;
// console.log("job=", $scope.job);
$http.get("/api/stats/" + $scope.job.conf.warcprox_meta.stats.buckets[0]).success(function(data) {
var bucket = $scope.job.conf.warcprox_meta.stats.buckets[0];
if (typeof(bucket) == "object") {
bucket = bucket["bucket"];
}
$http.get("/api/stats/" + bucket).success(function(data) {
$scope.job.stats = data;
// console.log("job stats=", $scope.job.stats);
});

View File

@ -1,26 +0,0 @@
FROM phusion/baseimage
MAINTAINER Noah Levitt <nlevitt@archive.org>
ENV LANG=C.UTF-8
RUN apt-get update && apt-get --auto-remove -y dist-upgrade
RUN apt-get -y install vnc4server
RUN apt-get -y install chromium-browser
RUN apt-get -y install xfonts-base fonts-arphic-bkai00mp fonts-arphic-bsmi00lp fonts-arphic-gbsn00lp fonts-arphic-gkai00mp fonts-arphic-ukai fonts-farsiweb fonts-nafees fonts-sil-abyssinica fonts-sil-ezra fonts-sil-padauk fonts-unfonts-extra fonts-unfonts-core ttf-indic-fonts fonts-thai-tlwg fonts-lklug-sinhala
RUN apt-get -y install python3-pip git vlc
RUN apt-get -y install libjpeg-turbo8-dev zlib1g-dev
RUN pip3 install websockify
RUN adduser --disabled-password --gecos="Charlie Brozzler" brozzler
RUN mkdir -vp /etc/service/vncserver
ADD vncserver.sh /etc/service/vncserver/run
RUN mkdir -vp /etc/service/vnc-websock
ADD vnc-websock.sh /etc/service/vnc-websock/run
EXPOSE 5901 8901
EXPOSE 8080
# RUN pip3 install -i http://crawl342.us.archive.org:9000/nlevitt/dev/+simple/ git+https://github.com/nlevitt/brozzler.git

View File

@ -1,2 +0,0 @@
#!/bin/sh
exec setuser brozzler websockify 0.0.0.0:8901 localhost:5901

View File

@ -1,12 +0,0 @@
#!/bin/bash
# https://github.com/phusion/baseimage-docker#adding-additional-daemons
# /usr/bin/vncserver backgrounds the Xvnc4 process, so we run Xvnc4 directly
# password_file=/tmp/vnc-passwd
# /bin/echo -ne '\x95\x3f\x23\x7a\x76\x2a\x05\x89' > $password_file
# exec setuser brozzler Xvnc4 :1 -desktop brozzler@`hostname`:1 -auth /tmp/Xauthority.brozzler -geometry 1600x1000 -depth 24 -rfbwait 0 -nolisten tcp -rfbport 5901 -rfbauth $password_file -pn -fp /usr/share/fonts/X11/misc/ -co /etc/X11/rgb >> /tmp/`hostname`:1.log 2>&1
# exec setuser brozzler Xvnc4 :1 -desktop brozzler@`hostname`:1 -auth /tmp/Xauthority.brozzler -geometry 1600x1000 -depth 24 -rfbwait 0 -nolisten tcp -rfbport 5901 -SecurityTypes None -pn -fp /usr/share/fonts/X11/misc/ -co /etc/X11/rgb AcceptCutText=0 AcceptPointerEvents=0 AcceptKeyEvents=0 >> /tmp/`hostname`:1.log 2>&1
exec setuser brozzler Xvnc4 :1 -desktop brozzler@`hostname`:1 -auth /tmp/Xauthority.brozzler -geometry 1600x1000 -depth 24 -rfbwait 0 -nolisten tcp -rfbport 5901 -SecurityTypes None -pn -fp /usr/share/fonts/X11/misc/ -co /etc/X11/rgb >> /tmp/`hostname`:1.log 2>&1

View File

@ -1,7 +0,0 @@
Chromium seemed to be dying more often when running in a docker container.
To start the services brozzler-worker depends on:
/home/nlevitt/workspace/brozzler/no-docker/vncserver.sh & /home/nlevitt/workspace/brozzler/no-docker/vnc-websock.sh &
Prerequisites:
apt-get -y install vnc4server chromium-browser xfonts-base fonts-arphic-bkai00mp fonts-arphic-bsmi00lp fonts-arphic-gbsn00lp fonts-arphic-gkai00mp fonts-arphic-ukai fonts-farsiweb fonts-nafees fonts-sil-abyssinica fonts-sil-ezra fonts-sil-padauk fonts-unfonts-extra fonts-unfonts-core ttf-indic-fonts fonts-thai-tlwg fonts-lklug-sinhala python3-pip git libjpeg-turbo8-dev zlib1g-dev

View File

@ -1,3 +0,0 @@
#!/bin/bash
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PYTHONPATH=/home/nlevitt/workspace/websockify/websockify-ve34/lib/python3.4/site-packages:/home/nlevitt/workspace/websockify exec /home/nlevitt/workspace/websockify/websockify-ve34/bin/websockify 0.0.0.0:8901 localhost:5901 >> /home/nlevitt/workspace/brozzler/no-docker/websockify-`hostname -s`.out 2>&1

View File

@ -1,4 +0,0 @@
#!/bin/bash
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
exec Xvnc4 :1 -auth /tmp/Xauthority.$USER -geometry 1600x1000 -depth 24 -rfbwait 0 -nolisten tcp -rfbport 5901 -SecurityTypes None -pn -fp /usr/share/fonts/X11/misc/ -co /etc/X11/rgb AcceptCutText=0 AcceptPointerEvents=0 AcceptKeyEvents=0 >> $script_dir/Xvnc4-`hostname -s`:1.out 2>&1

View File

@ -1,27 +1,27 @@
#
# setup.py - brozzler setup script
#
# Copyright (C) 2014-2016 Internet Archive
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#!/usr/bin/env python
'''
setup.py - brozzler setup script
Copyright (C) 2014-2016 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
import setuptools
import glob
setuptools.setup(
name='brozzler',
version='1.1.dev20',
version='1.1.dev26',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',
@ -30,7 +30,15 @@ setuptools.setup(
license='Apache License 2.0',
packages=['brozzler'],
package_data={'brozzler': ['behaviors.d/*.js*', 'behaviors.yaml']},
scripts=glob.glob('bin/*'),
entry_points={
'console_scripts': [
'brozzle-page=brozzler.cli:brozzle_page',
'brozzler-new-job=brozzler.cli:brozzler_new_job',
'brozzler-new-site=brozzler.cli:brozzler_new_site',
'brozzler-worker=brozzler.cli:brozzler_worker',
'brozzler-webconsole=brozzler.webconsole:run',
],
},
install_requires=[
'PyYAML',
'youtube-dl',
@ -38,11 +46,15 @@ setuptools.setup(
'requests',
'websocket-client',
'pillow',
'surt>=0.3b2',
'rethinkstuff',
'surt>=0.3.0',
'rethinkstuff>=0.1.5',
'rethinkdb>=2.3,<2.4',
'psutil',
],
extras_require={
'webconsole': ['flask>=0.11', 'gunicorn'],
# 'brozzler-easy': ['warcprox', 'pywb'],
},
zip_safe=False,
classifiers=[
'Development Status :: 4 - Beta',

View File

@ -1 +0,0 @@
gunicorn --bind=0.0.0.0:8081 brozzler-webconsole:app

@ -1 +0,0 @@
Subproject commit 6a90803feb124791960e3962e328aa3cfb729aeb

View File

@ -1,4 +0,0 @@
rethinkstuff>=0.1.5
flask>=0.11
gunicorn
PyYAML