Merge branch 'master' into qa

* master:
  move behavior_parameters into top level of site configuration
  install the virtualenv package with pip because the apt version is old and conflicts with the recent version of pip we're using
  logging tweak
  rename webconsole to dashboard
  add login details to behavior parameters
  initial login additions
This commit is contained in:
Noah Levitt 2016-11-07 18:16:18 -08:00
commit fbd540244b
34 changed files with 129 additions and 108 deletions

4
.gitmodules vendored
View File

@ -1,3 +1,3 @@
[submodule "brozzler/webconsole/static/noVNC"]
path = brozzler/webconsole/static/noVNC
[submodule "noVNC"]
path = brozzler/dashboard/static/noVNC
url = https://github.com/kanaka/noVNC.git

View File

@ -33,7 +33,7 @@ Getting Started
The easiest way to get started with brozzler for web archiving is with
``brozzler-easy``. Brozzler-easy runs brozzler-worker, warcprox,
`pywb <https://github.com/ikreymer/pywb>`_, and brozzler-webconsole, configured
`pywb <https://github.com/ikreymer/pywb>`_, and brozzler-dashboard, configured
to work with each other, in a single process.
Mac instructions:
@ -118,24 +118,24 @@ must be specified, everything else is optional. For details, see
scope:
surt: http://(org,example,
Brozzler Web Console
--------------------
Brozzler Dashboard
------------------
Brozzler comes with a rudimentary web application for viewing crawl job status.
To install brozzler with the dependencies required to run this app, run
::
pip install brozzler[webconsole]
pip install brozzler[dashboard]
To start the app, run
::
brozzler-webconsole
brozzler-dashboard
See ``brozzler-webconsole --help`` for configuration options.
See ``brozzler-dashboard --help`` for configuration options.
Headless Chromium
-----------------

View File

@ -9,7 +9,7 @@ localhost
[brozzler-worker]
localhost
[brozzler-webconsole]
[brozzler-dashboard]
localhost
[pywb]

View File

@ -16,7 +16,7 @@ work_dir=/vagrant
[brozzler-worker]
10.9.9.9
[brozzler-webconsole]
[brozzler-dashboard]
10.9.9.9
[pywb]

View File

@ -14,10 +14,10 @@
roles:
- brozzler-worker
- name: deploy brozzler-webconsole
hosts: brozzler-webconsole
- name: deploy brozzler-dashboard
hosts: brozzler-dashboard
roles:
- brozzler-webconsole
- brozzler-dashboard
- name: deploy pywb
hosts: pywb

View File

@ -0,0 +1,4 @@
---
# Handler: restarts the brozzler-dashboard service with elevated privileges.
# It runs only when a task lists "restart brozzler-dashboard" under `notify`.
- name: restart brozzler-dashboard
service: name=brozzler-dashboard state=restarted
become: true

View File

@ -0,0 +1,20 @@
---
# Tasks for the brozzler-dashboard role: create the virtualenv directory,
# install brozzler with the "dashboard" extra into it, and install the upstart
# job definition.

# Create the virtualenv directory (as root) and hand ownership to {{user}} so
# the pip task below can populate it while running as that user.
- name: mkdir {{venv_root}}/brozzler-dashboard-ve34
file: path={{venv_root}}/brozzler-dashboard-ve34 state=directory
owner={{user}}
become: true
# Install/upgrade brozzler[dashboard] into a python3.4 virtualenv as {{user}}.
# --pre permits pre-release versions; --cache-dir points pip's cache at
# /tmp/pip-cache. Notifies the restart handler.
- name: install brozzler[dashboard] in virtualenv
pip: name='{{brozzler_pip_name}}[dashboard]'
virtualenv={{venv_root}}/brozzler-dashboard-ve34
virtualenv_python=python3.4
extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
become: true
become_user: '{{user}}'
notify:
- restart brozzler-dashboard
# Render the upstart job template into /etc/init (requires root) and notify the
# restart handler.
- name: install upstart config /etc/init/brozzler-dashboard.conf
become: true
template: src=templates/brozzler-dashboard.conf.j2
dest=/etc/init/brozzler-dashboard.conf
notify:
- restart brozzler-dashboard

View File

@ -1,10 +1,10 @@
description "brozzler-webconsole"
description "brozzler-dashboard"
start on runlevel [2345]
stop on runlevel [!2345]
env PYTHONPATH={{venv_root}}/brozzler-webconsole-ve34/lib/python3.4/site-packages
env PATH={{venv_root}}/brozzler-webconsole-ve34/bin:/usr/bin:/bin
env PYTHONPATH={{venv_root}}/brozzler-dashboard-ve34/lib/python3.4/site-packages
env PATH={{venv_root}}/brozzler-dashboard-ve34/bin:/usr/bin:/bin
env LC_ALL=C.UTF-8
env WAYBACK_BASEURL=http://{{groups['pywb'][0]}}:8880/brozzler
@ -15,4 +15,4 @@ setuid {{user}}
console log
exec gunicorn --bind=0.0.0.0:8881 brozzler.webconsole:app
exec gunicorn --bind=0.0.0.0:8881 brozzler.dashboard:app

View File

@ -1,4 +0,0 @@
---
- name: restart brozzler-webconsole
service: name=brozzler-webconsole state=restarted
become: true

View File

@ -1,20 +0,0 @@
---
- name: mkdir {{venv_root}}/brozzler-webconsole-ve34
file: path={{venv_root}}/brozzler-webconsole-ve34 state=directory
owner={{user}}
become: true
- name: install brozzler[webconsole] in virtualenv
pip: name='{{brozzler_pip_name}}[webconsole]'
virtualenv={{venv_root}}/brozzler-webconsole-ve34
virtualenv_python=python3.4
extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
become: true
become_user: '{{user}}'
notify:
- restart brozzler-webconsole
- name: install upstart config /etc/init/brozzler-webconsole.conf
become: true
template: src=templates/brozzler-webconsole.conf.j2
dest=/etc/init/brozzler-webconsole.conf
notify:
- restart brozzler-webconsole

View File

@ -9,7 +9,6 @@
become: true
apt: name={{item}} state=present
with_items:
- python-virtualenv
- vnc4server
- chromium-browser
- xfonts-base

View File

@ -10,6 +10,6 @@ console log
env PYTHONPATH={{venv_root}}/websockify-ve34/lib/python3.4/site-packages
env PATH={{venv_root}}/websockify-ve34/bin:/usr/bin:/bin
# port 8901 is hard-coded in brozzler/webconsole/static/partials/workers.html
# port 8901 is hard-coded in brozzler/dashboard/static/partials/workers.html
exec nice websockify 0.0.0.0:8901 localhost:5901

View File

@ -1,24 +1,28 @@
---
## # get latest pip (had problems with version from apt-get, specifically
## # "pip install pyopenssl" did not install the dependency "cryptography")
## # http://stackoverflow.com/questions/34587473/what-is-get-pip-py-checksum-where-can-i-get-it-for-sure
## - name: install setuptools for python 2 and 3
## become: true
## apt: name={{item}} state=present
## with_items:
## - python-setuptools
## - python3-setuptools
## - name: download pip-8.1.2.tar.gz
## get_url:
## url: https://pypi.python.org/packages/e7/a8/7556133689add8d1a54c0b14aeff0acb03c64707ce100ecd53934da1aa13/pip-8.1.2.tar.gz
## dest: /tmp
## checksum: sha1:1c13c247967ec5bee6de5fd104c5d78ba30951c7
## - name: extract pip-8.1.2.tar.gz
## unarchive: src=/tmp/pip-8.1.2.tar.gz dest=/tmp copy=no
## - name: run "python3 setup.py install" in /tmp/pip-8.1.2
## command: python3 setup.py install chdir=/tmp/pip-8.1.2
## creates=/usr/local/lib/python2.7/dist-packages/pip-8.1.2-py2.7.egg/pip/__init__.py
## become: true
# get latest pip (had problems with version from apt-get, specifically
# "pip install pyopenssl" did not install the dependency "cryptography")
# http://stackoverflow.com/questions/34587473/what-is-get-pip-py-checksum-where-can-i-get-it-for-sure
- name: install setuptools for python 2 and 3
become: true
apt: name={{item}} state=present
with_items:
- python-setuptools
- python3-setuptools
- name: download pip-8.1.2.tar.gz
get_url:
url: https://pypi.python.org/packages/e7/a8/7556133689add8d1a54c0b14aeff0acb03c64707ce100ecd53934da1aa13/pip-8.1.2.tar.gz
dest: /tmp
checksum: sha1:1c13c247967ec5bee6de5fd104c5d78ba30951c7
- name: extract pip-8.1.2.tar.gz
unarchive: src=/tmp/pip-8.1.2.tar.gz dest=/tmp copy=no
- name: run "python3 setup.py install" in /tmp/pip-8.1.2
command: python3 setup.py install chdir=/tmp/pip-8.1.2
creates=/usr/local/lib/python2.7/dist-packages/pip-8.1.2-py2.7.egg/pip/__init__.py
become: true
- name: run "pip install virtualenv"
command: pip install virtualenv
creates=/usr/local/lib/python3.4/dist-packages/virtualenv.py
become: true
- command: id {{user}}
register: id_user
ignore_errors: true

View File

@ -4,7 +4,6 @@
apt: name={{item}} state=present
with_items:
- gcc
- python-virtualenv
- python3.4
- libpython3.4-dev
- libffi-dev

12
brozzler/cli.py Normal file → Executable file
View File

@ -120,6 +120,12 @@ def brozzle_page():
'-e', '--chrome-exe', dest='chrome_exe',
default=suggest_default_chrome_exe(),
help='executable to use to invoke chrome')
arg_parser.add_argument(
'--behavior-parameters', dest='behavior_parameters',
default=None, help=(
'json blob of parameters to populate the javascript behavior '
'template, e.g. {"parameter_username":"x",'
'"parameter_password":"y"}'))
arg_parser.add_argument(
'--proxy', dest='proxy', default=None,
help='http proxy')
@ -133,9 +139,13 @@ def brozzle_page():
args = arg_parser.parse_args(args=sys.argv[1:])
_configure_logging(args)
behavior_parameters = {}
if args.behavior_parameters:
behavior_parameters = json.loads(args.behavior_parameters)
site = brozzler.Site(
id=-1, seed=args.url, proxy=args.proxy,
enable_warcprox_features=args.enable_warcprox_features)
enable_warcprox_features=args.enable_warcprox_features,
behavior_parameters=behavior_parameters)
page = brozzler.Page(url=args.url, site_id=site.id)
worker = brozzler.BrozzlerWorker(frontier=None)

View File

@ -1,6 +1,6 @@
'''
brozzler/webconsole/__init__.py - flask app for brozzler web console, defines
api endspoints etc
brozzler/dashboard/__init__.py - flask app for brozzler dashboard, defines api
endpoints etc
Copyright (C) 2014-2016 Internet Archive
@ -24,7 +24,7 @@ try:
except ImportError as e:
logging.critical(
'%s: %s\n\nYou might need to run "pip install '
'brozzler[webconsole]".\nSee README.rst for more information.',
'brozzler[dashboard]".\nSee README.rst for more information.',
type(e).__name__, e)
sys.exit(1)
import rethinkstuff
@ -210,11 +210,11 @@ try:
import gunicorn.app.base
from gunicorn.six import iteritems
class GunicornBrozzlerWebConsole(gunicorn.app.base.BaseApplication):
class GunicornBrozzlerDashboard(gunicorn.app.base.BaseApplication):
def __init__(self, app, options=None):
self.options = options or {}
self.application = app
super(GunicornBrozzlerWebConsole, self).__init__()
super(GunicornBrozzlerDashboard, self).__init__()
def load_config(self):
config = dict(
@ -227,12 +227,12 @@ try:
return self.application
def run(**options):
logging.info('running brozzler-webconsole using gunicorn')
GunicornBrozzlerWebConsole(app, options).run()
logging.info('running brozzler-dashboard using gunicorn')
GunicornBrozzlerDashboard(app, options).run()
except ImportError:
def run():
logging.info('running brozzler-webconsole using simple flask app.run')
logging.info('running brozzler-dashboard using simple flask app.run')
app.run()
def main():
@ -241,10 +241,10 @@ def main():
prog=os.path.basename(sys.argv[0]),
formatter_class=argparse.RawDescriptionHelpFormatter,
description=(
'brozzler-webconsole - web application for viewing brozzler '
'brozzler-dashboard - web application for viewing brozzler '
'crawl status'),
epilog=(
'brozzler-webconsole has no command line options, but can be '
'brozzler-dashboard has no command line options, but can be '
'configured using the following environment variables:\n\n'
' RETHINKDB_SERVERS rethinkdb servers, e.g. db0.foo.org,'
'db0.foo.org:38015,db1.foo.org (default: localhost)\n'

View File

Before

Width:  |  Height:  |  Size: 9.1 KiB

After

Width:  |  Height:  |  Size: 9.1 KiB

View File

@ -1,5 +1,5 @@
/*
* brozzler-webconsole/static/js/app.js - brozzler console angularjs code
* brozzler/dashboard/static/js/app.js - brozzler dashboard angularjs code
*
* Copyright (C) 2014-2016 Internet Archive
*
@ -18,12 +18,12 @@
"use strict";
var brozzlerConsoleApp = angular.module("brozzlerConsoleApp", [
var brozzlerDashboardApp = angular.module("brozzlerDashboardApp", [
"ngRoute",
"brozzlerControllers",
]);
brozzlerConsoleApp.config(["$routeProvider", "$locationProvider",
brozzlerDashboardApp.config(["$routeProvider", "$locationProvider",
function($routeProvider, $locationProvider) {
$routeProvider.
when("/workers", {
@ -53,7 +53,7 @@ brozzlerConsoleApp.config(["$routeProvider", "$locationProvider",
}]);
// copied from https://bitbucket.org/webarchive/ait5/src/master/archiveit/static/app/js/filters/ByteFormat.js
brozzlerConsoleApp.filter("byteformat", function() {
brozzlerDashboardApp.filter("byteformat", function() {
return function(bytes, precision) {
var bytes_f = parseFloat(bytes);
if (bytes_f == 0 || isNaN(bytes_f) || !isFinite(bytes_f)) return "0";

@ -0,0 +1 @@
Subproject commit ef887cdb123df21b61043ff025e6208631e9eb7b

View File

@ -1,12 +1,12 @@
<!doctype html>
<html lang="en" ng-app="brozzlerConsoleApp">
<html lang="en" ng-app="brozzlerDashboardApp">
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no">
<meta name="apple-mobile-web-app-capable" content="yes" />
<meta name="apple-mobile-web-app-status-bar-style" content="black-translucent" />
<title>Brozzler Console</title>
<title>Brozzler Dashboard</title>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/3.3.5/css/bootstrap.css">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/3.3.5/css/bootstrap-theme.css">
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/4.4.0/css/font-awesome.css">

View File

@ -1,6 +1,6 @@
#!/usr/bin/env python
'''
brozzler-easy - brozzler-worker, warcprox, pywb, and brozzler-webconsole all
brozzler-easy - brozzler-worker, warcprox, pywb, and brozzler-dashboard all
working together in a single process
Copyright (C) 2016 Internet Archive
@ -27,7 +27,7 @@ try:
import brozzler.pywb
import wsgiref.simple_server
import wsgiref.handlers
import brozzler.webconsole
import brozzler.dashboard
except ImportError as e:
logging.critical(
'%s: %s\n\nYou might need to run "pip install '
@ -51,7 +51,7 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
prog=prog, formatter_class=argparse.ArgumentDefaultsHelpFormatter,
description=(
'brozzler-easy - easy deployment of brozzler, with '
'brozzler-worker, warcprox, pywb, and brozzler-webconsole all '
'brozzler-worker, warcprox, pywb, and brozzler-dashboard all '
'running in a single process'))
# common args
@ -104,14 +104,14 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
'--pywb-port', dest='pywb_port', type=int,
default=8880, help='pywb wayback port')
# webconsole args
# dashboard args
arg_parser.add_argument(
'--webconsole-address', dest='webconsole_address',
'--dashboard-address', dest='dashboard_address',
default='localhost',
help='brozzler web console address to listen on')
help='brozzler dashboard address to listen on')
arg_parser.add_argument(
'--webconsole-port', dest='webconsole_port',
type=int, default=8881, help='brozzler web console port')
'--dashboard-port', dest='dashboard_port',
type=int, default=8881, help='brozzler dashboard port')
# common at the bottom args
arg_parser.add_argument(
@ -143,12 +143,12 @@ class BrozzlerEasyController:
self._warcprox_args(args))
self.brozzler_worker = self._init_brozzler_worker(args)
self.pywb_httpd = self._init_pywb(args)
self.webconsole_httpd = self._init_brozzler_webconsole(args)
self.dashboard_httpd = self._init_brozzler_dashboard(args)
def _init_brozzler_webconsole(self, args):
def _init_brozzler_dashboard(self, args):
return wsgiref.simple_server.make_server(
args.webconsole_address, args.webconsole_port,
brozzler.webconsole.app, ThreadingWSGIServer)
args.dashboard_address, args.dashboard_port,
brozzler.dashboard.app, ThreadingWSGIServer)
def _init_brozzler_worker(self, args):
r = rethinkstuff.Rethinker(
@ -212,13 +212,13 @@ class BrozzlerEasyController:
threading.Thread(target=self.pywb_httpd.serve_forever).start()
self.logger.info(
'starting brozzler-webconsole at %s:%s',
*self.webconsole_httpd.server_address)
threading.Thread(target=self.webconsole_httpd.serve_forever).start()
'starting brozzler-dashboard at %s:%s',
*self.dashboard_httpd.server_address)
threading.Thread(target=self.dashboard_httpd.serve_forever).start()
def shutdown(self):
self.logger.info('shutting down brozzler-webconsole')
self.webconsole_httpd.shutdown()
self.logger.info('shutting down brozzler-dashboard')
self.dashboard_httpd.shutdown()
self.logger.info('shutting down brozzler-worker')
self.brozzler_worker.shutdown_now()

View File

@ -75,6 +75,8 @@ def new_job(frontier, job_conf):
sites = []
for seed_conf in job_conf["seeds"]:
merged_conf = merge(seed_conf, job_conf)
if "login" in merged_conf and "metadata" in merged_conf:
merged_conf["metadata"]["login"] = merged_conf["login"]
site = brozzler.Site(
job_id=job.id, seed=merged_conf["url"],
scope=merged_conf.get("scope"),

View File

@ -69,6 +69,9 @@ id:
user_agent:
type: string
behavior_parameters:
type: dict
seeds:
type: list
required: true

View File

@ -96,7 +96,7 @@ class Site(brozzler.BaseDictable):
status="ACTIVE", claimed=False, start_time=None,
last_disclaimed=_EPOCH_UTC, last_claimed_by=None,
last_claimed=_EPOCH_UTC, metadata={}, remember_outlinks=None,
cookie_db=None, user_agent=None):
cookie_db=None, user_agent=None, behavior_parameters=None):
self.seed = seed
self.id = id
@ -117,6 +117,7 @@ class Site(brozzler.BaseDictable):
self.remember_outlinks = remember_outlinks
self.cookie_db = cookie_db
self.user_agent = user_agent
self.behavior_parameters = behavior_parameters
self.scope = scope or {}
if not "surt" in self.scope:

@ -1 +0,0 @@
Subproject commit 6a90803feb124791960e3962e328aa3cfb729aeb

View File

@ -273,6 +273,7 @@ class BrozzlerWorker:
browser.start(proxy=self._proxy(site), cookie_db=site.cookie_db)
outlinks = browser.browse_page(
page.url, extra_headers=site.extra_headers(),
behavior_parameters=site.behavior_parameters,
user_agent=site.user_agent,
on_screenshot=_on_screenshot,
on_url_change=page.note_redirect)
@ -388,7 +389,9 @@ class BrozzlerWorker:
try:
site = self._frontier.claim_site("{}:{}".format(
socket.gethostname(), browser.chrome_port))
self.logger.info("brozzling site %s", site)
self.logger.info(
"brozzling site (proxy=%s) %s",
repr(self._proxy(site)), site)
th = threading.Thread(
target=lambda: self._brozzle_site(
browser, site),

View File

@ -32,17 +32,17 @@ def find_package_data(package):
setuptools.setup(
name='brozzler',
version='1.1b7.dev109',
version='1.1b7.dev113',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',
author_email='nlevitt@archive.org',
long_description=open('README.rst', mode='rb').read().decode('UTF-8'),
license='Apache License 2.0',
packages=['brozzler', 'brozzler.webconsole'],
packages=['brozzler', 'brozzler.dashboard'],
package_data={
'brozzler': ['behaviors.d/*.js*', 'behaviors.yaml', 'job_schema.yaml'],
'brozzler.webconsole': find_package_data('brozzler.webconsole'),
'brozzler.dashboard': find_package_data('brozzler.dashboard'),
},
entry_points={
'console_scripts': [
@ -51,7 +51,7 @@ setuptools.setup(
'brozzler-new-site=brozzler.cli:brozzler_new_site',
'brozzler-worker=brozzler.cli:brozzler_worker',
'brozzler-ensure-tables=brozzler.cli:brozzler_ensure_tables',
'brozzler-webconsole=brozzler.webconsole:main',
'brozzler-dashboard=brozzler.dashboard:main',
'brozzler-easy=brozzler.easy:main',
'brozzler-wayback=brozzler.pywb:main',
],
@ -70,7 +70,7 @@ setuptools.setup(
'cerberus==1.0.1',
],
extras_require={
'webconsole': ['flask>=0.11', 'gunicorn'],
'dashboard': ['flask>=0.11', 'gunicorn'],
'easy': ['warcprox>=2.0b1', 'pywb', 'flask>=0.11', 'gunicorn'],
},
zip_safe=False,

View File

@ -86,7 +86,7 @@ def test_services_up():
# if the connect fails an exception is raised and the test fails
s.connect(('localhost', 8880))
# check that brozzler webconsole is listening
# check that brozzler dashboard is listening
with socket.socket() as s:
# if the connect fails an exception is raised and the test fails
s.connect(('localhost', 8881))

View File

@ -6,7 +6,7 @@ echo service status:
vagrant ssh -- 'status warcprox ;
status Xvnc ;
status brozzler-worker ;
status brozzler-webconsole ;
status brozzler-dashboard ;
status vnc-websock'
echo