Merge branch 'master' into qa

* master:
  document a bunch of job settings
  starting on documenting job configuration
  replace vagrant-brozzler-new-site with python script that fills in default options and passes through others
  add missing rethinkdb config file to ansible config
  new script runs brozzler-new-site queues a new site to brozzle on the vagrant brozzler deployment
  better logs for facebook logins
  working on including pywb in vagrant environment (not finished)
  starting to create a framework for testing
  header comment tweak
  new prog "brozzler-wayback" runs monkey-patched pywb
This commit is contained in:
Noah Levitt 2016-09-30 08:51:17 -07:00
commit 80883c9784
23 changed files with 582 additions and 49 deletions

View File

@ -95,7 +95,8 @@ Job Configuration
Jobs are defined using yaml files. Options may be specified either at the
top-level or on individual seeds. A job id and at least one seed url
must be specified, everything else is optional.
must be specified, everything else is optional. For details, see
`<job-conf.rst>`_.
::

View File

@ -1,21 +1,21 @@
#
# brozzler/job.py - Job class representing a brozzler crawl job, and functions
# for setting up a job with supplied configuration
#
# Copyright (C) 2014-2016 Internet Archive
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
'''
brozzler/job.py - Job class representing a brozzler crawl job, and functions
for setting up a job with supplied configuration
Copyright (C) 2014-2016 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
import logging
import brozzler
@ -87,7 +87,7 @@ def new_site(frontier, site):
frontier.new_page(page)
logging.info("queued page %s", page)
else:
logging.warn("seed url {} is blocked by robots.txt".format(site.seed))
logging.warn("seed url %s is blocked by robots.txt", site.seed)
finally:
# finally block because we want to insert the Site no matter what
frontier.new_site(site)

View File

@ -1,6 +1,7 @@
#!/usr/bin/env python
'''
brozzler/pywb.py - pywb support for rethinkdb index
brozzler/pywb.py - pywb customizations for brozzler including rethinkdb index,
loading from warcs still being written to, and canonicalization rules matching
brozzler conventions
Copyright (C) 2016 Internet Archive
@ -35,6 +36,7 @@ import rethinkstuff
import rethinkdb
import surt
import json
import brozzler
class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
def __init__(self, servers, db, table):
@ -192,3 +194,13 @@ def support_in_progress_warcs():
results.append('%s.open' % warc_path)
return results
pywb.warc.pathresolvers.PrefixResolver.__call__ = _prefix_resolver_call
def main(argv=sys.argv):
    '''
    Entry point for the "brozzler-wayback" program: applies brozzler's
    monkey-patches to pywb and then runs pywb's wayback command line app.

    argv: full argument vector including the program name; everything after
        the program name is handed to pywb's WaybackCli
    '''
    # patch pywb first, so that the wayback app starts up with brozzler's
    # url canonicalization and support for warcs still being written to
    canonicalizer = brozzler.pywb.TheGoodUrlCanonicalizer
    canonicalizer.replace_default_canonicalizer()
    canonicalizer.monkey_patch_dsrules_init()
    brozzler.pywb.support_in_progress_warcs()

    description = (
            'brozzler-wayback - pywb wayback (monkey-patched for use '
            'with brozzler)')
    pywb.apps.cli.WaybackCli(
            args=argv[1:], default_port=8880, desc=description).run()

View File

@ -12,11 +12,19 @@
<div>
<h2>Workers</h2>
<p><i>This page depends on some deployment details outside of brozzler
itself, namely that port 8901 on each brozzler-worker is running
websockify bridging VNC running on the same host. The vagrant+ansible
configuration in the brozzler repo contains an example of that.
https://github.com/internetarchive/brozzler/tree/master/vagrant
</i></p>
<div class="row">
<div class="col-xs-12 col-sm-12 col-md-6 col-lg-6" ng-repeat="worker in workers">
<div>{{worker}}</div>
<div>{{worker.host}}</div>
<iframe style="width:45rem;height:32rem;"
ng-src="{{'/static/noVNC/vnc.html?host=' + worker.host + '&port=8901&autoconnect=1&resize=downscale'}}">
<iframe style="width:45rem;height:32rem;"
ng-src="{{'/static/noVNC/vnc.html?host=' + worker.host + '&port=8901&autoconnect=1&resize=downscale'}}">
</iframe>
</div>
</div>

205
job-conf.rst Normal file
View File

@ -0,0 +1,205 @@
brozzler job configuration
**************************
Jobs are defined using yaml files. Options may be specified either at the
top-level or on individual seeds. A job id and at least one seed url
must be specified, everything else is optional.
an example
==========
::
id: myjob
time_limit: 60 # seconds
proxy: 127.0.0.1:8000 # point at warcprox for archiving
ignore_robots: false
enable_warcprox_features: false
warcprox_meta:
warc-prefix: job1
stats:
buckets:
- job1-stats
metadata: {}
seeds:
- url: http://one.example.org/
warcprox_meta:
warc-prefix: job1-seed1
stats:
buckets:
- job1-seed1-stats
- url: http://two.example.org/
time_limit: 30
- url: http://three.example.org/
time_limit: 10
ignore_robots: true
scope:
surt: http://(org,example,
how inheritance works
=====================
Most of the available options apply to seeds. Such options can also be
specified at the top level, in which case the seeds inherit the options. If
an option is specified both at the top level and at the level of an individual
seed, the results are merged with the seed-level value taking precedence in
case of conflicts. It's probably easiest to make sense of this by way of an
example.
In the example yaml above, ``warcprox_meta`` is specified at the top level and
at the seed level for the seed http://one.example.org/. At the top level we
have::
warcprox_meta:
warc-prefix: job1
stats:
buckets:
- job1-stats
At the seed level we have::
warcprox_meta:
warc-prefix: job1-seed1
stats:
buckets:
- job1-seed1-stats
The merged configuration as applied to the seed http://one.example.org/ will
be::
warcprox_meta:
warc-prefix: job1-seed1
stats:
buckets:
- job1-stats
- job1-seed1-stats
Notice that:
- There is a collision on ``warc-prefix`` and the seed-level value wins.
- Since ``buckets`` is a list, the merged result includes all the values from
both the top level and the seed level.
settings reference
==================
id
--
+-----------+--------+----------+---------+
| scope | type | required | default |
+===========+========+==========+=========+
| top-level | string | yes? | *n/a* |
+-----------+--------+----------+---------+
An arbitrary identifier for this job. Must be unique across this deployment of
brozzler.
seeds
-----
+-----------+------------------------+----------+---------+
| scope | type | required | default |
+===========+========================+==========+=========+
| top-level | list (of dictionaries) | yes | *n/a* |
+-----------+------------------------+----------+---------+
List of seeds. Each item in the list is a dictionary (associative array) which
defines the seed. It must specify ``url`` (see below) and can additionally
specify any of the settings of scope *seed-level*.
url
---
+------------+--------+----------+---------+
| scope | type | required | default |
+============+========+==========+=========+
| seed-level | string | yes | *n/a* |
+------------+--------+----------+---------+
The seed url.
time_limit
----------
+-----------------------+--------+----------+---------+
| scope | type | required | default |
+=======================+========+==========+=========+
| seed-level, top-level | number | no | *none* |
+-----------------------+--------+----------+---------+
Time limit in seconds. If not specified, there is no time limit. Time limit is
enforced at the seed level. If a time limit is specified at the top level, it
is inherited by each seed as described above, and enforced individually on each
seed.
proxy
-----
+-----------------------+--------+----------+---------+
| scope | type | required | default |
+=======================+========+==========+=========+
| seed-level, top-level | string | no | *none* |
+-----------------------+--------+----------+---------+
HTTP proxy, with the format ``host:port``. Typically configured to point to
warcprox for archival crawling.
enable_warcprox_features
------------------------
+-----------------------+---------+----------+---------+
| scope | type | required | default |
+=======================+=========+==========+=========+
| seed-level, top-level | boolean | no | false |
+-----------------------+---------+----------+---------+
If true for a given seed, and the seed is configured to use a proxy, enables
special features that assume the proxy is an instance of warcprox. As of this
writing, the special features that are enabled are:
- sending screenshots and thumbnails to warcprox using a WARCPROX_WRITE_RECORD
request
- sending youtube-dl metadata json to warcprox using a WARCPROX_WRITE_RECORD
request
See the warcprox docs for information on the WARCPROX_WRITE_RECORD method (XXX
not yet written).
*Note that if* ``warcprox_meta`` *and* ``proxy`` *are configured, the
Warcprox-Meta header will be sent even if* ``enable_warcprox_features`` *is not
set.*
ignore_robots
-------------
+-----------------------+---------+----------+---------+
| scope | type | required | default |
+=======================+=========+==========+=========+
| seed-level, top-level | boolean | no | false |
+-----------------------+---------+----------+---------+
If set to ``true``, brozzler will happily crawl pages that would otherwise be
blocked by robots.txt rules.
warcprox_meta
-------------
+-----------------------+------------+----------+---------+
| scope | type | required | default |
+=======================+============+==========+=========+
| seed-level, top-level | dictionary | no | false |
+-----------------------+------------+----------+---------+
Specifies the Warcprox-Meta header to send with every request, if ``proxy`` is
configured. The value of the Warcprox-Meta header is a json blob. It is used to
pass settings and information to warcprox. Warcprox does not forward the header
on to the remote site. See the warcprox docs for more information (XXX not yet
written).
Brozzler takes the configured value of ``warcprox_meta``, converts it to
json and populates the Warcprox-Meta header with that value. For example::
warcprox_meta:
warc-prefix: job1-seed1
stats:
buckets:
- job1-stats
- job1-seed1-stats
becomes::
Warcprox-Meta: {"warc-prefix":"job1-seed1","stats":{"buckets":["job1-stats","job1-seed1-stats"]}}
scope
-----
+-----------------------+------------+----------+---------+
| scope | type | required | default |
+=======================+============+==========+=========+
| seed-level, top-level | dictionary | no | false |
+-----------------------+------------+----------+---------+
Scope rules. *TODO*

View File

@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup(
name='brozzler',
version='1.1b6.dev78',
version='1.1b6.dev87',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',
@ -53,6 +53,7 @@ setuptools.setup(
'brozzler-ensure-tables=brozzler.cli:brozzler_ensure_tables',
'brozzler-webconsole=brozzler.webconsole:main',
'brozzler-easy=brozzler.easy:main',
'brozzler-wayback=brozzler.pywb:main',
],
},
install_requires=[

1
tests/htdocs/file1.txt Normal file
View File

@ -0,0 +1 @@
I'm a plain text file.

88
tests/test_cluster.py Normal file
View File

@ -0,0 +1,88 @@
#!/usr/bin/env python
'''
test_cluster.py - integration tests for a brozzler cluster,
expects brozzler, warcprox, pywb, rethinkdb and other dependencies to be
running already
Copyright (C) 2016 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
import pytest
import http.server
import threading
import urllib.request
import os
import socket
import rethinkstuff
@pytest.fixture(scope='module')
def httpd(request):
    '''
    Module-scoped fixture that serves the "htdocs" directory next to this
    file over http, on localhost, from a background thread.

    Returns the http.server.HTTPServer instance; tests read the port it ended
    up on from its server_port attribute. The server is shut down, and the
    working directory restored, when the module's tests are finished.
    '''
    # SimpleHTTPRequestHandler always uses CWD so we have to chdir. Register
    # the finalizer that undoes the chdir before doing anything else, so the
    # working directory is put back even if starting the server fails.
    # Finalizers run last-registered-first, so the server is stopped before
    # the directory is restored.
    original_cwd = os.getcwd()
    request.addfinalizer(lambda: os.chdir(original_cwd))
    os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs'))

    # port 0 asks for any free port
    server = http.server.HTTPServer(
            ('localhost', 0), http.server.SimpleHTTPRequestHandler)
    server_thread = threading.Thread(name='httpd', target=server.serve_forever)
    server_thread.start()

    def fin():
        server.shutdown()
        server.server_close()
        server_thread.join()
    request.addfinalizer(fin)

    return server
def test_httpd(httpd):
    '''
    Tests that our http server is working as expected, and that two fetches
    of the same url return the same payload, proving it can be used to test
    deduplication.
    '''
    url = 'http://localhost:%s/' % httpd.server_port

    payload1 = payload2 = None
    with urllib.request.urlopen(url) as response:
        assert response.status == 200
        payload1 = response.read()
        assert payload1

    with urllib.request.urlopen(url) as response:
        assert response.status == 200
        payload2 = response.read()
        assert payload2

    assert payload1 == payload2
def test_services_up():
    '''Check that the expected services are up and running.'''
    # warcprox should be accepting connections; a failed connect raises an
    # exception, which fails the test
    warcprox_address = ('localhost', 8000)
    with socket.socket() as sock:
        sock.connect(warcprox_address)

    ### # check that pywb is listening
    ### with socket.socket() as s:
    ###     # if the connect fails an exception is raised and the test fails
    ###     s.connect(('localhost', 8880))

    # rethinkdb should be listening and look sane: its built-in "rethinkdb"
    # database is expected to list more than 10 tables
    rethinker = rethinkstuff.Rethinker(db='rethinkdb')
    table_names = rethinker.table_list().run()
    assert len(table_names) > 10
def test_brozzle_site(httpd):
    '''Placeholder, nothing is checked here yet.'''

View File

@ -13,4 +13,4 @@ ansible_ssh_private_key_file=.vagrant/machines/10.9.9.9/virtualbox/private_key
10.9.9.9
[pywb]
10.9.9.9
10.9.9.9

View File

@ -24,7 +24,7 @@
roles:
- brozzler-webconsole
# - name: deploy pywb
# hosts: pywb
# roles:
# - pywb
- name: deploy pywb
hosts: pywb
roles:
- pywb

View File

@ -1,6 +1,5 @@
---
- name: install brozzler[webconsole] in virtualenv
become: true
pip: name='-e /brozzler[webconsole]'
virtualenv=/home/vagrant/brozzler-webconsole-ve34
virtualenv_python=python3.4
@ -12,4 +11,4 @@
template: src=templates/brozzler-webconsole.conf.j2
dest=/etc/init/brozzler-webconsole.conf
notify:
- restart brozzler-webconsole
- restart brozzler-webconsole

View File

@ -26,7 +26,6 @@
- ttf-indic-fonts
- fonts-thai-tlwg
- fonts-lklug-sinhala
- python3-pip
- git
- libjpeg-turbo8-dev
- zlib1g-dev
@ -49,7 +48,6 @@
notify:
- restart vnc-websock
- name: install brozzler in virtualenv
become: true
pip: # name=git+https://github.com/internetarchive/brozzler.git#egg=brozzler
name='-e /brozzler'
virtualenv=/home/vagrant/brozzler-ve34

View File

@ -1,4 +1,24 @@
---
- name: ensure logs directory exists
## # get latest pip (had problems with version from apt-get, specifically
## # "pip install pyopenssl" did not install the dependency "cryptography")
## # http://stackoverflow.com/questions/34587473/what-is-get-pip-py-checksum-where-can-i-get-it-for-sure
## - name: install setuptools for python 2 and 3
## become: true
## apt: name={{item}} state=present
## with_items:
## - python-setuptools
## - python3-setuptools
## - name: download pip-8.1.2.tar.gz
## get_url:
## url: https://pypi.python.org/packages/e7/a8/7556133689add8d1a54c0b14aeff0acb03c64707ce100ecd53934da1aa13/pip-8.1.2.tar.gz
## dest: /tmp
## checksum: sha1:1c13c247967ec5bee6de5fd104c5d78ba30951c7
## - name: extract pip-8.1.2.tar.gz
## unarchive: src=/tmp/pip-8.1.2.tar.gz dest=/tmp copy=no
## - name: run "python3 setup.py install" in /tmp/pip-8.1.2
## command: python3 setup.py install chdir=/tmp/pip-8.1.2
## creates=/usr/local/lib/python2.7/dist-packages/pip-8.1.2-py2.7.egg/pip/__init__.py
## become: true
- name: mkdir /vagrant/logs
file: path=/vagrant/logs state=directory
become: true

View File

@ -0,0 +1,5 @@
---
- name: restart pywb
service: name=pywb state=restarted
become: true

View File

@ -0,0 +1,27 @@
---
- name: install pywb in virtualenv
pip: name=pywb
virtualenv=/home/vagrant/pywb-ve34
virtualenv_python=python3.4
extra_args='--no-input --upgrade --pre'
notify:
- restart pywb
- name: install brozzler in pywb virtualenv
pip: name='-e /brozzler'
virtualenv=/home/vagrant/pywb-ve34
virtualenv_python=python3.4
extra_args='--no-input --upgrade --pre'
notify:
- restart pywb
- name: pywb config file /etc/pywb.yml
template: src=templates/pywb.yml.j2
dest=/etc/pywb.yml
become: true
notify:
- restart pywb
- name: upstart config file /etc/init/pywb.conf
template: src=templates/pywb.conf.j2
dest=/etc/init/pywb.conf
become: true
notify:
- restart pywb

View File

@ -0,0 +1,14 @@
description "pywb"
start on runlevel [2345]
stop on runlevel [!2345]
env PYTHONPATH=/home/vagrant/pywb-ve34/lib/python3.4/site-packages
env PATH=/home/vagrant/pywb-ve34/bin:/usr/bin:/bin
env PYWB_CONFIG_FILE=/etc/pywb.yml
setuid vagrant
# console log
exec nice brozzler-wayback >>/vagrant/logs/pywb.log 2>&1

View File

@ -0,0 +1,12 @@
archive_paths: /vagrant/warcs/
collections:
brozzler:
index_paths: !!python/object:brozzler.pywb.RethinkCDXSource
db: brozzler
servers: [localhost]
table: captures
enable_auto_colls: false
enable_cdx_api: true
framed_replay: true
port: 8880

View File

@ -10,12 +10,14 @@
apt: name=rethinkdb state=present
become: true
notify:
- restart rethinkdb
- restart rethinkdb
# XXX rethinkdb fails to start in spite of this, I think because /vagrant
# gets mounted too late, and it tries to log there
- name: ensure rethinkdb starts on reboot
service: name=rethinkdb enabled=yes
- name: ensure rethinkdb instance config file is installed
template: src=templates/rethinkdb-brozzler-easy.conf.j2
dest=/etc/rethinkdb/instances.d/rethinkdb-brozzler-easy.conf
template: src=templates/rethinkdb-brozzler-vagrant-1.conf.j2
dest=/etc/rethinkdb/instances.d/rethinkdb-brozzler-vagrant-1.conf
become: true
notify:
- restart rethinkdb
- restart rethinkdb

View File

@ -3,23 +3,23 @@
become: true
apt: name={{item}} state=present
with_items:
- gcc
- python-virtualenv
- python3.4
- libpython3.4-dev
- libffi-dev
- libssl-dev
- tor
- git
- gcc
- python-virtualenv
- python3.4
- libpython3.4-dev
- libffi-dev
- libssl-dev
- tor
- git
- name: install warcprox in virtualenv
pip: name=git+https://github.com/internetarchive/warcprox.git@2.x#egg=warcprox
virtualenv=/home/vagrant/warcprox-ve34
virtualenv_python=python3.4
extra_args='--no-input --upgrade --pre'
notify:
- restart warcprox
- restart warcprox
- name: install upstart config /etc/init/warcprox.conf
become: true
template: src=templates/warcprox.conf.j2 dest=/etc/init/warcprox.conf
notify:
- restart warcprox
- restart warcprox

12
vagrant/run-tests.sh Executable file
View File

@ -0,0 +1,12 @@
#!/bin/bash
#
# run-tests.sh - prints the status of the services inside the vagrant vm, then
# installs pytest into the brozzler virtualenv there and runs the tests found
# in /brozzler/tests
#
# NOTE(review): "vagrant ssh" has to be able to find the Vagrantfile, so
# presumably this needs to be run from the vagrant directory -- confirm
# report on each service; this is informational only, a stopped service does
# not stop the script
echo service status:
vagrant ssh -- 'status warcprox ;
status Xvnc ;
status brozzler-worker ;
status brozzler-webconsole ;
status vnc-websock'
echo
# make sure pytest is available in the virtualenv before using it
vagrant ssh -- 'source brozzler-ve34/bin/activate && pip install pytest'
# -v for verbose test names, -s so output from the tests is not captured
vagrant ssh -- 'source brozzler-ve34/bin/activate && py.test -v -s /brozzler/tests'

View File

@ -0,0 +1,42 @@
#!/usr/bin/env python
'''
vagrant-brozzler-new-job.py - runs brozzler-new-job inside the vagrant vm to
queue a job for your vagrant brozzler deployment.
This is a standalone script with no dependencies other than python, and should
work with python 2.7 or python 3.2+. The only reason it's not a bash script is
so we can use the argparse library.
'''
import sys
import os
import argparse
import subprocess
def main(argv=None):
    '''
    Sends a brozzler job configuration file into the vagrant vm.

    Parses the command line, then pipes the contents of the job configuration
    file to "vagrant ssh", where the remote command saves it to a new
    temporary file.

    argv: full argument vector including the program name; defaults to
        sys.argv
    '''
    if argv is None:
        argv = sys.argv

    arg_parser = argparse.ArgumentParser(prog=os.path.basename(argv[0]))
    arg_parser.add_argument(
            'job_conf_file', metavar='JOB_CONF_FILE',
            help='brozzler job configuration file in yaml')
    args = arg_parser.parse_args(args=argv[1:])

    # open the job conf before changing directory, so that a relative path
    # given on the command line is resolved against the caller's cwd
    with open(args.job_conf_file, 'rb') as f:
        # cd to path with Vagrantfile so "vagrant ssh" knows what to do;
        # abspath because dirname(__file__) is '' when the script is run from
        # its own directory, and chdir('') fails
        os.chdir(os.path.dirname(os.path.abspath(__file__)))

        # stdin has to be a file object or descriptor, bytes are not accepted
        # TODO the remote command only saves the job conf to a temp file, it
        # does not run brozzler-new-job on it yet
        subprocess.call(
                ['vagrant', 'ssh', '--', 'f=`mktemp` && cat > $f'],
                stdin=f)


if __name__ == '__main__':
    main(sys.argv)
## # cd to path with Vagrantfile so "vagrant ssh" knows what to do
## script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
## cd $script_dir
##
## vagrant ssh -- \
## PYTHONPATH=/home/vagrant/brozzler-ve34/lib/python3.4/site-packages \
## /home/vagrant/brozzler-ve34/bin/python \
## /home/vagrant/brozzler-ve34/bin/brozzler-new-job "$@"

View File

@ -0,0 +1,86 @@
#!/usr/bin/env python
'''
vagrant-brozzler-new-site.py - runs brozzler-new-site inside the vagrant vm to
queue a site for your vagrant brozzler deployment.
Fills in the --proxy option automatically. Some other options are passed
through.
This is a standalone script with no dependencies other than python, and should
work with python 2.7 or python 3.2+. The only reason it's not a bash script is
so we can use the argparse library.
Copyright (C) 2014-2016 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
import sys
import os
import argparse
import subprocess
try:
    from shlex import quote
except ImportError:
    # python 2.7 has no shlex.quote, the same function lives in pipes there
    from pipes import quote
def main(argv=None):
    '''
    Runs brozzler-new-site inside the vagrant vm to queue a site.

    Parses the command line, builds the equivalent brozzler-new-site command
    with --proxy=localhost:8000 and --enable-warcprox-features filled in, and
    runs it in the vm by way of "vagrant ssh".

    argv: full argument vector including the program name; defaults to
        sys.argv
    '''
    if argv is None:
        argv = sys.argv

    arg_parser = argparse.ArgumentParser(prog=os.path.basename(argv[0]))
    arg_parser.add_argument('seed', metavar='SEED', help='seed url')
    arg_parser.add_argument(
            '--time-limit', dest='time_limit', default=None,
            help='time limit in seconds for this site')
    arg_parser.add_argument(
            '--ignore-robots', dest='ignore_robots', action='store_true',
            help='ignore robots.txt for this site')
    arg_parser.add_argument(
            '--warcprox-meta', dest='warcprox_meta',
            help=(
                'Warcprox-Meta http request header to send with each request; '
                'must be a json blob, ignored unless warcprox features are '
                'enabled'))
    arg_parser.add_argument(
            '-q', '--quiet', dest='quiet', action='store_true')
    arg_parser.add_argument(
            '-v', '--verbose', dest='verbose', action='store_true')
    args = arg_parser.parse_args(args=argv[1:])

    # the command string is interpreted by a shell inside the vm, so every
    # value that comes from the user is shell-quoted; values made up of only
    # safe characters come out of quote() unchanged
    options = []
    if args.time_limit:
        options.append('--time-limit=%s' % quote(args.time_limit))
    if args.ignore_robots:
        options.append('--ignore-robots')
    if args.warcprox_meta:
        options.append(
                '--warcprox-meta=%s' % quote(args.warcprox_meta))
    if args.quiet:
        options.append('--quiet')
    if args.verbose:
        options.append('--verbose')

    # cd to path with Vagrantfile so "vagrant ssh" knows what to do; abspath
    # because dirname(__file__) is '' when the script is run from its own
    # directory, and chdir('') fails
    os.chdir(os.path.dirname(os.path.abspath(__file__)))

    # urls routinely contain characters like & and ? that mean something to
    # the shell, so the seed is quoted too
    cmd = (
            'PYTHONPATH=/home/vagrant/brozzler-ve34/lib/python3.4/site-packages '
            '/home/vagrant/brozzler-ve34/bin/python '
            '/home/vagrant/brozzler-ve34/bin/brozzler-new-site '
            '--proxy=localhost:8000 --enable-warcprox-features %s %s') % (
                ' '.join(options), quote(args.seed))
    subprocess.call(['vagrant', 'ssh', '--', cmd])


if __name__ == '__main__':
    main(sys.argv)