mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-20 23:56:34 -04:00
Merge branch 'master' into qa
* master: document a bunch of job settings starting on documenting job configuration replace vagrant-brozzler-new-site with python script that fills in default options and passes through others add missing rethinkdb config file to ansible config new script runs brozzler-new-site queues a new site to brozzle on the vagrant brozzler deployment better logs for facebook logins working on including pywb in vagrant environment (not finished) starting to create a framework for testing header comment tweak new prog "brozzler-wayback" runs monkey-patched pywb
This commit is contained in:
commit
80883c9784
@ -95,7 +95,8 @@ Job Configuration
|
||||
|
||||
Jobs are defined using yaml files. Options may be specified either at the
|
||||
top-level or on individual seeds. A job id and at least one seed url
|
||||
must be specified, everything else is optional.
|
||||
must be specified, everything else is optional. For details, see
|
||||
`<job-conf.rst>`_.
|
||||
|
||||
::
|
||||
|
||||
|
@ -1,21 +1,21 @@
|
||||
#
|
||||
# brozzler/job.py - Job class representing a brozzler crawl job, and functions
|
||||
# for setting up a job with supplied configuration
|
||||
#
|
||||
# Copyright (C) 2014-2016 Internet Archive
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
'''
|
||||
brozzler/job.py - Job class representing a brozzler crawl job, and functions
|
||||
for setting up a job with supplied configuration
|
||||
|
||||
Copyright (C) 2014-2016 Internet Archive
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
'''
|
||||
|
||||
import logging
|
||||
import brozzler
|
||||
@ -87,7 +87,7 @@ def new_site(frontier, site):
|
||||
frontier.new_page(page)
|
||||
logging.info("queued page %s", page)
|
||||
else:
|
||||
logging.warn("seed url {} is blocked by robots.txt".format(site.seed))
|
||||
logging.warn("seed url %s is blocked by robots.txt", site.seed)
|
||||
finally:
|
||||
# finally block because we want to insert the Site no matter what
|
||||
frontier.new_site(site)
|
||||
|
@ -1,6 +1,7 @@
|
||||
#!/usr/bin/env python
|
||||
'''
|
||||
brozzler/pywb.py - pywb support for rethinkdb index
|
||||
brozzler/pywb.py - pywb customizations for brozzler including rethinkdb index,
|
||||
loading from warcs still being written to, and canonicalization rules matching
|
||||
brozzler conventions
|
||||
|
||||
Copyright (C) 2016 Internet Archive
|
||||
|
||||
@ -35,6 +36,7 @@ import rethinkstuff
|
||||
import rethinkdb
|
||||
import surt
|
||||
import json
|
||||
import brozzler
|
||||
|
||||
class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
|
||||
def __init__(self, servers, db, table):
|
||||
@ -192,3 +194,13 @@ def support_in_progress_warcs():
|
||||
results.append('%s.open' % warc_path)
|
||||
return results
|
||||
pywb.warc.pathresolvers.PrefixResolver.__call__ = _prefix_resolver_call
|
||||
|
||||
def main(argv=sys.argv):
|
||||
brozzler.pywb.TheGoodUrlCanonicalizer.replace_default_canonicalizer()
|
||||
brozzler.pywb.TheGoodUrlCanonicalizer.monkey_patch_dsrules_init()
|
||||
brozzler.pywb.support_in_progress_warcs()
|
||||
wayback_cli = pywb.apps.cli.WaybackCli(
|
||||
args=argv[1:], default_port=8880,
|
||||
desc=('brozzler-wayback - pywb wayback (monkey-patched for use '
|
||||
'with brozzler)'))
|
||||
wayback_cli.run()
|
||||
|
@ -12,11 +12,19 @@
|
||||
<div>
|
||||
<h2>Workers</h2>
|
||||
|
||||
<p><i>This page depends on some deployment details outside of brozzler
|
||||
itself, namely that port 8901 on each brozzler-worker is running
|
||||
websockify bridging VNC running on the same host. The vagrant+ansible
|
||||
configuration in the brozzler repo contains an example of that.
|
||||
https://github.com/internetarchive/brozzler/tree/master/vagrant
|
||||
</i></p>
|
||||
|
||||
<div class="row">
|
||||
<div class="col-xs-12 col-sm-12 col-md-6 col-lg-6" ng-repeat="worker in workers">
|
||||
<div>{{worker}}</div>
|
||||
<div>{{worker.host}}</div>
|
||||
<iframe style="width:45rem;height:32rem;"
|
||||
ng-src="{{'/static/noVNC/vnc.html?host=' + worker.host + '&port=8901&autoconnect=1&resize=downscale'}}">
|
||||
<iframe style="width:45rem;height:32rem;"
|
||||
ng-src="{{'/static/noVNC/vnc.html?host=' + worker.host + '&port=8901&autoconnect=1&resize=downscale'}}">
|
||||
</iframe>
|
||||
</div>
|
||||
</div>
|
||||
|
205
job-conf.rst
Normal file
205
job-conf.rst
Normal file
@ -0,0 +1,205 @@
|
||||
brozzler job configuration
|
||||
**************************
|
||||
|
||||
Jobs are defined using yaml files. Options may be specified either at the
|
||||
top-level or on individual seeds. A job id and at least one seed url
|
||||
must be specified, everything else is optional.
|
||||
|
||||
an example
|
||||
==========
|
||||
|
||||
::
|
||||
|
||||
id: myjob
|
||||
time_limit: 60 # seconds
|
||||
proxy: 127.0.0.1:8000 # point at warcprox for archiving
|
||||
ignore_robots: false
|
||||
enable_warcprox_features: false
|
||||
warcprox_meta:
|
||||
warc-prefix: job1
|
||||
stats:
|
||||
buckets:
|
||||
- job1-stats
|
||||
metadata: {}
|
||||
seeds:
|
||||
- url: http://one.example.org/
|
||||
warcprox_meta:
|
||||
warc-prefix: job1-seed1
|
||||
stats:
|
||||
buckets:
|
||||
- job1-seed1-stats
|
||||
- url: http://two.example.org/
|
||||
time_limit: 30
|
||||
- url: http://three.example.org/
|
||||
time_limit: 10
|
||||
ignore_robots: true
|
||||
scope:
|
||||
surt: http://(org,example,
|
||||
|
||||
how inheritance works
|
||||
=====================
|
||||
|
||||
Most of the available options apply to seeds. Such options can also be
|
||||
specified at the top level, in which case the seeds inherit the options. If
|
||||
an option is specified both at the top level and at the level of an individual
|
||||
seed, the results are merged with the seed-level value taking precedence in
|
||||
case of conflicts. It's probably easiest to make sense of this by way of an
|
||||
example.
|
||||
|
||||
In the example yaml above, ``warcprox_meta`` is specified at the top level and
|
||||
at the seed level for the seed http://one.example.org/. At the top level we
|
||||
have::
|
||||
|
||||
warcprox_meta:
|
||||
warc-prefix: job1
|
||||
stats:
|
||||
buckets:
|
||||
- job1-stats
|
||||
|
||||
At the seed level we have::
|
||||
|
||||
warcprox_meta:
|
||||
warc-prefix: job1-seed1
|
||||
stats:
|
||||
buckets:
|
||||
- job1-seed1-stats
|
||||
|
||||
The merged configuration as applied to the seed http://one.example.org/ will
|
||||
be::
|
||||
|
||||
warcprox_meta:
|
||||
warc-prefix: job1-seed1
|
||||
stats:
|
||||
buckets:
|
||||
- job1-stats
|
||||
- job1-seed1-stats
|
||||
|
||||
Notice that:
|
||||
|
||||
- There is a collision on ``warc-prefix`` and the seed-level value wins.
|
||||
- Since ``buckets`` is a list, the merged result includes all the values from
|
||||
both the top level and the seed level.
|
||||
|
||||
settings reference
|
||||
==================
|
||||
|
||||
id
|
||||
--
|
||||
+-----------+--------+----------+---------+
|
||||
| scope | type | required | default |
|
||||
+===========+========+==========+=========+
|
||||
| top-level | string | yes? | *n/a* |
|
||||
+-----------+--------+----------+---------+
|
||||
An arbitrary identifier for this job. Must be unique across this deployment of
|
||||
brozzler.
|
||||
|
||||
seeds
|
||||
-----
|
||||
+-----------+------------------------+----------+---------+
|
||||
| scope | type | required | default |
|
||||
+===========+========================+==========+=========+
|
||||
| top-level | list (of dictionaries) | yes | *n/a* |
|
||||
+-----------+------------------------+----------+---------+
|
||||
List of seeds. Each item in the list is a dictionary (associative array) which
|
||||
defines the seed. It must specify ``url`` (see below) and can additionally
|
||||
specify any of the settings of scope *seed-level*.
|
||||
|
||||
url
|
||||
---
|
||||
+------------+--------+----------+---------+
|
||||
| scope | type | required | default |
|
||||
+============+========+==========+=========+
|
||||
| seed-level | string | yes | *n/a* |
|
||||
+------------+--------+----------+---------+
|
||||
The seed url.
|
||||
|
||||
time_limit
|
||||
----------
|
||||
+-----------------------+--------+----------+---------+
|
||||
| scope | type | required | default |
|
||||
+=======================+========+==========+=========+
|
||||
| seed-level, top-level | number | no | *none* |
|
||||
+-----------------------+--------+----------+---------+
|
||||
Time limit in seconds. If not specified, there is no time limit. Time limit is
|
||||
enforced at the seed level. If a time limit is specified at the top level, it
|
||||
is inherited by each seed as described above, and enforced individually on each
|
||||
seed.
|
||||
|
||||
proxy
|
||||
-----
|
||||
+-----------------------+--------+----------+---------+
|
||||
| scope | type | required | default |
|
||||
+=======================+========+==========+=========+
|
||||
| seed-level, top-level | string | no | *none* |
|
||||
+-----------------------+--------+----------+---------+
|
||||
HTTP proxy, with the format ``host:port``. Typically configured to point to
|
||||
warcprox for archival crawling.
|
||||
|
||||
enable_warcprox_features
|
||||
------------------------
|
||||
+-----------------------+---------+----------+---------+
|
||||
| scope | type | required | default |
|
||||
+=======================+=========+==========+=========+
|
||||
| seed-level, top-level | boolean | no | false |
|
||||
+-----------------------+---------+----------+---------+
|
||||
If true for a given seed, and the seed is configured to use a proxy, enables
|
||||
special features that assume the proxy is an instance of warcprox. As of this
|
||||
writing, the special features that are enabled are:
|
||||
|
||||
- sending screenshots and thumbnails to warcprox using a WARCPROX_WRITE_RECORD
|
||||
request
|
||||
- sending youtube-dl metadata json to warcprox using a WARCPROX_WRITE_RECORD
|
||||
request
|
||||
|
||||
See the warcprox docs for information on the WARCPROX_WRITE_RECORD method (XXX
|
||||
not yet written).
|
||||
|
||||
*Note that if* ``warcprox_meta`` *and* ``proxy`` *are configured, the
|
||||
Warcprox-Meta header will be sent even if* ``enable_warcprox_features`` *is not
|
||||
set.*
|
||||
|
||||
ignore_robots
|
||||
-------------
|
||||
+-----------------------+---------+----------+---------+
|
||||
| scope | type | required | default |
|
||||
+=======================+=========+==========+=========+
|
||||
| seed-level, top-level | boolean | no | false |
|
||||
+-----------------------+---------+----------+---------+
|
||||
If set to ``true``, brozzler will happily crawl pages that would otherwise be
|
||||
blocked by robots.txt rules.
|
||||
|
||||
warcprox_meta
|
||||
-------------
|
||||
+-----------------------+------------+----------+---------+
|
||||
| scope | type | required | default |
|
||||
+=======================+============+==========+=========+
|
||||
| seed-level, top-level | dictionary | no | false |
|
||||
+-----------------------+------------+----------+---------+
|
||||
Specifies the Warcprox-Meta header to send with every request, if ``proxy`` is
|
||||
configured. The value of the Warcprox-Meta header is a json blob. It is used to
|
||||
pass settings and information to warcprox. Warcprox does not forward the header
|
||||
on to the remote site. See the warcprox docs for more information (XXX not yet
|
||||
written).
|
||||
|
||||
Brozzler takes the configured value of ``warcprox_meta``, converts it to
|
||||
json and populates the Warcprox-Meta header with that value. For example::
|
||||
|
||||
warcprox_meta:
|
||||
warc-prefix: job1-seed1
|
||||
stats:
|
||||
buckets:
|
||||
- job1-stats
|
||||
- job1-seed1-stats
|
||||
|
||||
becomes::
|
||||
|
||||
Warcprox-Meta: {"warc-prefix":"job1-seed1","stats":{"buckets":["job1-stats","job1-seed1-stats"]}}
|
||||
|
||||
scope
|
||||
-----
|
||||
+-----------------------+------------+----------+---------+
|
||||
| scope | type | required | default |
|
||||
+=======================+============+==========+=========+
|
||||
| seed-level, top-level | dictionary | no | false |
|
||||
+-----------------------+------------+----------+---------+
|
||||
Scope rules. *TODO*
|
3
setup.py
3
setup.py
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||
|
||||
setuptools.setup(
|
||||
name='brozzler',
|
||||
version='1.1b6.dev78',
|
||||
version='1.1b6.dev87',
|
||||
description='Distributed web crawling with browsers',
|
||||
url='https://github.com/internetarchive/brozzler',
|
||||
author='Noah Levitt',
|
||||
@ -53,6 +53,7 @@ setuptools.setup(
|
||||
'brozzler-ensure-tables=brozzler.cli:brozzler_ensure_tables',
|
||||
'brozzler-webconsole=brozzler.webconsole:main',
|
||||
'brozzler-easy=brozzler.easy:main',
|
||||
'brozzler-wayback=brozzler.pywb:main',
|
||||
],
|
||||
},
|
||||
install_requires=[
|
||||
|
1
tests/htdocs/file1.txt
Normal file
1
tests/htdocs/file1.txt
Normal file
@ -0,0 +1 @@
|
||||
I'm a plain text file.
|
88
tests/test_cluster.py
Normal file
88
tests/test_cluster.py
Normal file
@ -0,0 +1,88 @@
|
||||
#!/usr/bin/env python
|
||||
'''
|
||||
test_cluster.py - integration tests for a brozzler cluster,
|
||||
expects brozzler, warcprox, pywb, rethinkdb and other dependencies to be
|
||||
running already
|
||||
|
||||
Copyright (C) 2016 Internet Archive
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
'''
|
||||
|
||||
import pytest
|
||||
import http.server
|
||||
import threading
|
||||
import urllib.request
|
||||
import os
|
||||
import socket
|
||||
import rethinkstuff
|
||||
|
||||
@pytest.fixture(scope='module')
def httpd(request):
    '''
    Module-scoped fixture providing a local http server.

    Serves the "htdocs" directory that sits next to this test file, bound to
    an ephemeral port on localhost (read it from the returned server's
    server_port attribute). The server runs in a background thread and is
    shut down by a finalizer registered on the requesting test context.
    '''
    # SimpleHTTPRequestHandler always serves out of CWD, hence the chdir
    htdocs_dir = os.path.join(os.path.dirname(__file__), 'htdocs')
    os.chdir(htdocs_dir)

    # port 0 asks the OS to pick any free port
    server = http.server.HTTPServer(
            ('localhost', 0), http.server.SimpleHTTPRequestHandler)
    server_thread = threading.Thread(
            name='httpd', target=server.serve_forever)
    server_thread.start()

    def stop_server():
        # stop serve_forever(), release the socket, then reap the thread
        server.shutdown()
        server.server_close()
        server_thread.join()

    request.addfinalizer(stop_server)

    return server
|
||||
|
||||
def test_httpd(httpd):
    '''
    Tests that our http server is working as expected, and that two fetches
    of the same url return the same payload, proving it can be used to test
    deduplication.
    '''
    # both names are initialised up front (the original initialised a stray
    # "content2" instead of "payload2")
    payload1 = payload2 = None
    url = 'http://localhost:%s/' % httpd.server_port

    with urllib.request.urlopen(url) as response:
        assert response.status == 200
        payload1 = response.read()
        assert payload1

    with urllib.request.urlopen(url) as response:
        assert response.status == 200
        payload2 = response.read()
        assert payload2

    assert payload1 == payload2
|
||||
|
||||
def test_services_up():
    '''Check that the expected services are up and running.'''
    # warcprox: a failed connect raises, which is what fails the test
    warcprox_address = ('localhost', 8000)
    with socket.socket() as sock:
        sock.connect(warcprox_address)

    ### # check that pywb is listening
    ### with socket.socket() as s:
    ###     # if the connect fails an exception is raised and the test fails
    ###     s.connect(('localhost', 8880))

    # rethinkdb: query the built-in "rethinkdb" db and sanity check the
    # number of tables it reports
    rethinker = rethinkstuff.Rethinker(db='rethinkdb')
    table_names = rethinker.table_list().run()
    assert len(table_names) > 10
|
||||
|
||||
def test_brozzle_site(httpd):
|
||||
pass
|
||||
|
@ -13,4 +13,4 @@ ansible_ssh_private_key_file=.vagrant/machines/10.9.9.9/virtualbox/private_key
|
||||
10.9.9.9
|
||||
|
||||
[pywb]
|
||||
10.9.9.9
|
||||
10.9.9.9
|
||||
|
@ -24,7 +24,7 @@
|
||||
roles:
|
||||
- brozzler-webconsole
|
||||
|
||||
# - name: deploy pywb
|
||||
# hosts: pywb
|
||||
# roles:
|
||||
# - pywb
|
||||
- name: deploy pywb
|
||||
hosts: pywb
|
||||
roles:
|
||||
- pywb
|
||||
|
@ -1,6 +1,5 @@
|
||||
---
|
||||
- name: install brozzler[webconsole] in virtualenv
|
||||
become: true
|
||||
pip: name='-e /brozzler[webconsole]'
|
||||
virtualenv=/home/vagrant/brozzler-webconsole-ve34
|
||||
virtualenv_python=python3.4
|
||||
@ -12,4 +11,4 @@
|
||||
template: src=templates/brozzler-webconsole.conf.j2
|
||||
dest=/etc/init/brozzler-webconsole.conf
|
||||
notify:
|
||||
- restart brozzler-webconsole
|
||||
- restart brozzler-webconsole
|
||||
|
@ -26,7 +26,6 @@
|
||||
- ttf-indic-fonts
|
||||
- fonts-thai-tlwg
|
||||
- fonts-lklug-sinhala
|
||||
- python3-pip
|
||||
- git
|
||||
- libjpeg-turbo8-dev
|
||||
- zlib1g-dev
|
||||
@ -49,7 +48,6 @@
|
||||
notify:
|
||||
- restart vnc-websock
|
||||
- name: install brozzler in virtualenv
|
||||
become: true
|
||||
pip: # name=git+https://github.com/internetarchive/brozzler.git#egg=brozzler
|
||||
name='-e /brozzler'
|
||||
virtualenv=/home/vagrant/brozzler-ve34
|
||||
|
@ -1,4 +1,24 @@
|
||||
---
|
||||
- name: ensure logs directory exists
|
||||
## # get latest pip (had problems with version from apt-get, specifically
|
||||
## # "pip install pyopenssl" did not install the dependency "cryptography")
|
||||
## # http://stackoverflow.com/questions/34587473/what-is-get-pip-py-checksum-where-can-i-get-it-for-sure
|
||||
## - name: install setuptools for python 2 and 3
|
||||
## become: true
|
||||
## apt: name={{item}} state=present
|
||||
## with_items:
|
||||
## - python-setuptools
|
||||
## - python3-setuptools
|
||||
## - name: download pip-8.1.2.tar.gz
|
||||
## get_url:
|
||||
## url: https://pypi.python.org/packages/e7/a8/7556133689add8d1a54c0b14aeff0acb03c64707ce100ecd53934da1aa13/pip-8.1.2.tar.gz
|
||||
## dest: /tmp
|
||||
## checksum: sha1:1c13c247967ec5bee6de5fd104c5d78ba30951c7
|
||||
## - name: extract pip-8.1.2.tar.gz
|
||||
## unarchive: src=/tmp/pip-8.1.2.tar.gz dest=/tmp copy=no
|
||||
## - name: run "python3 setup.py install" in /tmp/pip-8.1.2
|
||||
## command: python3 setup.py install chdir=/tmp/pip-8.1.2
|
||||
## creates=/usr/local/lib/python2.7/dist-packages/pip-8.1.2-py2.7.egg/pip/__init__.py
|
||||
## become: true
|
||||
- name: mkdir /vagrant/logs
|
||||
file: path=/vagrant/logs state=directory
|
||||
become: true
|
||||
|
5
vagrant/ansible/roles/pywb/handlers/main.yml
Normal file
5
vagrant/ansible/roles/pywb/handlers/main.yml
Normal file
@ -0,0 +1,5 @@
|
||||
---
|
||||
- name: restart pywb
|
||||
service: name=pywb state=restarted
|
||||
become: true
|
||||
|
27
vagrant/ansible/roles/pywb/tasks/main.yml
Normal file
27
vagrant/ansible/roles/pywb/tasks/main.yml
Normal file
@ -0,0 +1,27 @@
|
||||
---
|
||||
- name: install pywb in virtualenv
|
||||
pip: name=pywb
|
||||
virtualenv=/home/vagrant/pywb-ve34
|
||||
virtualenv_python=python3.4
|
||||
extra_args='--no-input --upgrade --pre'
|
||||
notify:
|
||||
- restart pywb
|
||||
- name: install brozzler in pywb virtualenv
|
||||
pip: name='-e /brozzler'
|
||||
virtualenv=/home/vagrant/pywb-ve34
|
||||
virtualenv_python=python3.4
|
||||
extra_args='--no-input --upgrade --pre'
|
||||
notify:
|
||||
- restart pywb
|
||||
- name: pywb config file /etc/pywb.yml
|
||||
template: src=templates/pywb.yml.j2
|
||||
dest=/etc/pywb.yml
|
||||
become: true
|
||||
notify:
|
||||
- restart pywb
|
||||
- name: upstart config file /etc/init/pywb.conf
|
||||
template: src=templates/pywb.conf.j2
|
||||
dest=/etc/init/pywb.conf
|
||||
become: true
|
||||
notify:
|
||||
- restart pywb
|
14
vagrant/ansible/roles/pywb/templates/pywb.conf.j2
Normal file
14
vagrant/ansible/roles/pywb/templates/pywb.conf.j2
Normal file
@ -0,0 +1,14 @@
|
||||
description "pywb"
|
||||
|
||||
start on runlevel [2345]
|
||||
stop on runlevel [!2345]
|
||||
|
||||
env PYTHONPATH=/home/vagrant/pywb-ve34/lib/python3.4/site-packages
|
||||
env PATH=/home/vagrant/pywb-ve34/bin:/usr/bin:/bin
|
||||
env PYWB_CONFIG_FILE=/etc/pywb.yml
|
||||
|
||||
setuid vagrant
|
||||
|
||||
# console log
|
||||
|
||||
exec nice brozzler-wayback >>/vagrant/logs/pywb.log 2>&1
|
12
vagrant/ansible/roles/pywb/templates/pywb.yml.j2
Normal file
12
vagrant/ansible/roles/pywb/templates/pywb.yml.j2
Normal file
@ -0,0 +1,12 @@
|
||||
archive_paths: /vagrant/warcs/
|
||||
collections:
|
||||
brozzler:
|
||||
index_paths: !!python/object:brozzler.pywb.RethinkCDXSource
|
||||
db: brozzler
|
||||
servers: [localhost]
|
||||
table: captures
|
||||
enable_auto_colls: false
|
||||
enable_cdx_api: true
|
||||
framed_replay: true
|
||||
port: 8880
|
||||
|
@ -10,12 +10,14 @@
|
||||
apt: name=rethinkdb state=present
|
||||
become: true
|
||||
notify:
|
||||
- restart rethinkdb
|
||||
- restart rethinkdb
|
||||
# XXX rethinkdb fails to start in spite of this, I think because /vagrant
|
||||
# gets mounted too late, and it tries to log there
|
||||
- name: ensure rethinkdb starts on reboot
|
||||
service: name=rethinkdb enabled=yes
|
||||
- name: ensure rethinkdb instance config file is installed
|
||||
template: src=templates/rethinkdb-brozzler-easy.conf.j2
|
||||
dest=/etc/rethinkdb/instances.d/rethinkdb-brozzler-easy.conf
|
||||
template: src=templates/rethinkdb-brozzler-vagrant-1.conf.j2
|
||||
dest=/etc/rethinkdb/instances.d/rethinkdb-brozzler-vagrant-1.conf
|
||||
become: true
|
||||
notify:
|
||||
- restart rethinkdb
|
||||
- restart rethinkdb
|
||||
|
@ -3,23 +3,23 @@
|
||||
become: true
|
||||
apt: name={{item}} state=present
|
||||
with_items:
|
||||
- gcc
|
||||
- python-virtualenv
|
||||
- python3.4
|
||||
- libpython3.4-dev
|
||||
- libffi-dev
|
||||
- libssl-dev
|
||||
- tor
|
||||
- git
|
||||
- gcc
|
||||
- python-virtualenv
|
||||
- python3.4
|
||||
- libpython3.4-dev
|
||||
- libffi-dev
|
||||
- libssl-dev
|
||||
- tor
|
||||
- git
|
||||
- name: install warcprox in virtualenv
|
||||
pip: name=git+https://github.com/internetarchive/warcprox.git@2.x#egg=warcprox
|
||||
virtualenv=/home/vagrant/warcprox-ve34
|
||||
virtualenv_python=python3.4
|
||||
extra_args='--no-input --upgrade --pre'
|
||||
notify:
|
||||
- restart warcprox
|
||||
- restart warcprox
|
||||
- name: install upstart config /etc/init/warcprox.conf
|
||||
become: true
|
||||
template: src=templates/warcprox.conf.j2 dest=/etc/init/warcprox.conf
|
||||
notify:
|
||||
- restart warcprox
|
||||
- restart warcprox
|
||||
|
12
vagrant/run-tests.sh
Executable file
12
vagrant/run-tests.sh
Executable file
@ -0,0 +1,12 @@
|
||||
#!/bin/bash
|
||||
|
||||
echo service status:
|
||||
vagrant ssh -- 'status warcprox ;
|
||||
status Xvnc ;
|
||||
status brozzler-worker ;
|
||||
status brozzler-webconsole ;
|
||||
status vnc-websock'
|
||||
echo
|
||||
|
||||
vagrant ssh -- 'source brozzler-ve34/bin/activate && pip install pytest'
|
||||
vagrant ssh -- 'source brozzler-ve34/bin/activate && py.test -v -s /brozzler/tests'
|
42
vagrant/vagrant-brozzler-new-job.py
Executable file
42
vagrant/vagrant-brozzler-new-job.py
Executable file
@ -0,0 +1,42 @@
|
||||
#!/usr/bin/env python
|
||||
'''
|
||||
vagrant-brozzler-new-job.py - runs brozzler-new-job inside the vagrant vm to
|
||||
queue a job for your vagrant brozzler deployment.
|
||||
|
||||
This is a standalone script with no dependencies other than python, and should
|
||||
work with python 2.7 or python 3.2+. The only reason it's not a bash script is
|
||||
so we can use the argparse library.
|
||||
'''
|
||||
|
||||
import sys
|
||||
import os
|
||||
import argparse
|
||||
import subprocess
|
||||
|
||||
def main(argv=None):
    '''
    Copy a brozzler job configuration file into the vagrant vm.

    Parses the command line, then streams the contents of JOB_CONF_FILE over
    "vagrant ssh" into a freshly created temp file inside the vm. (As written
    the remote command only stores the file; it does not yet run
    brozzler-new-job on it.)

    argv: full argument list including the program name at argv[0];
        defaults to sys.argv when omitted
    '''
    if argv is None:
        argv = sys.argv
    # tolerate an empty list instead of blowing up on argv[0]; with prog=None
    # argparse falls back to its own default program name
    prog = os.path.basename(argv[0]) if argv else None

    arg_parser = argparse.ArgumentParser(prog=prog)
    arg_parser.add_argument(
            'job_conf_file', metavar='JOB_CONF_FILE',
            help='brozzler job configuration file in yaml')
    args = arg_parser.parse_args(args=argv[1:])

    # abspath so that dirname is never '' (os.chdir('') raises) when the
    # script is invoked by bare filename from its own directory
    script_dir = os.path.dirname(os.path.abspath(__file__))

    # open the file before changing directory so that a relative
    # JOB_CONF_FILE is resolved against the directory the user ran us from
    with open(args.job_conf_file, 'rb') as f:
        # cd to path with Vagrantfile so "vagrant ssh" knows what to do
        os.chdir(script_dir)
        # stdin has to be a file object or descriptor, not the raw bytes
        subprocess.call(
                ['vagrant', 'ssh', '--', 'f=`mktemp` && cat > $f'],
                stdin=f)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main(sys.argv)
|
||||
|
||||
## # cd to path with Vagrantfile so "vagrant ssh" knows what to do
|
||||
## script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
## cd $script_dir
|
||||
##
|
||||
## vagrant ssh -- \
|
||||
## PYTHONPATH=/home/vagrant/brozzler-ve34/lib/python3.4/site-packages \
|
||||
## /home/vagrant/brozzler-ve34/bin/python \
|
||||
## /home/vagrant/brozzler-ve34/bin/brozzler-new-job "$@"
|
86
vagrant/vagrant-brozzler-new-site.py
Executable file
86
vagrant/vagrant-brozzler-new-site.py
Executable file
@ -0,0 +1,86 @@
|
||||
#!/usr/bin/env python
|
||||
'''
|
||||
vagrant-brozzler-new-site.py - runs brozzler-new-site inside the vagrant vm to
|
||||
queue a site for your vagrant brozzler deployment.
|
||||
|
||||
Fills in the --proxy option automatically. Some other options are passed
|
||||
through.
|
||||
|
||||
This is a standalone script with no dependencies other than python, and should
|
||||
work with python 2.7 or python 3.2+. The only reason it's not a bash script is
|
||||
so we can use the argparse library.
|
||||
|
||||
Copyright (C) 2014-2016 Internet Archive
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
'''
|
||||
|
||||
import sys
|
||||
import os
|
||||
import argparse
|
||||
import subprocess
|
||||
try:
|
||||
from shlex import quote
|
||||
except:
|
||||
from pipes import quote
|
||||
|
||||
def main(argv=None):
    '''
    Queue a seed url on the vagrant brozzler deployment.

    Parses the command line, rebuilds the options that are passed through
    (--time-limit, --ignore-robots, --warcprox-meta, --quiet, --verbose) and
    runs brozzler-new-site inside the vm using "vagrant ssh". The options
    --proxy=localhost:8000 and --enable-warcprox-features are always added.

    argv: full argument list including the program name at argv[0];
        defaults to sys.argv when omitted
    '''
    if argv is None:
        argv = sys.argv
    # tolerate an empty list instead of blowing up on argv[0]; with prog=None
    # argparse falls back to its own default program name
    prog = os.path.basename(argv[0]) if argv else None

    arg_parser = argparse.ArgumentParser(prog=prog)
    arg_parser.add_argument('seed', metavar='SEED', help='seed url')
    arg_parser.add_argument(
            '--time-limit', dest='time_limit', default=None,
            help='time limit in seconds for this site')
    arg_parser.add_argument(
            '--ignore-robots', dest='ignore_robots', action='store_true',
            help='ignore robots.txt for this site')
    arg_parser.add_argument(
            '--warcprox-meta', dest='warcprox_meta',
            help=(
                'Warcprox-Meta http request header to send with each request; '
                'must be a json blob, ignored unless warcprox features are '
                'enabled'))
    arg_parser.add_argument(
            '-q', '--quiet', dest='quiet', action='store_true')
    arg_parser.add_argument(
            '-v', '--verbose', dest='verbose', action='store_true')

    args = arg_parser.parse_args(args=argv[1:])

    # The command ends up as a single string interpreted by the shell inside
    # the vm, so every user-supplied value is shell-quoted.
    options = []
    if args.time_limit:
        options.append('--time-limit=%s' % quote(args.time_limit))
    if args.ignore_robots:
        options.append('--ignore-robots')
    if args.warcprox_meta:
        options.append(
                '--warcprox-meta=%s' % quote(args.warcprox_meta))
    if args.quiet:
        options.append('--quiet')
    if args.verbose:
        options.append('--verbose')

    # cd to path with Vagrantfile so "vagrant ssh" knows what to do; abspath
    # so that dirname is never '' (os.chdir('') raises) when the script is
    # invoked by bare filename from its own directory
    os.chdir(os.path.dirname(os.path.abspath(__file__)))

    # the seed is quoted too: urls routinely contain "&", "?" and ";", which
    # the remote shell would otherwise interpret
    cmd = (
            'PYTHONPATH=/home/vagrant/brozzler-ve34/lib/python3.4/site-packages '
            '/home/vagrant/brozzler-ve34/bin/python '
            '/home/vagrant/brozzler-ve34/bin/brozzler-new-site '
            '--proxy=localhost:8000 --enable-warcprox-features %s %s') % (
                ' '.join(options), quote(args.seed))
    subprocess.call(['vagrant', 'ssh', '--', cmd])
|
||||
|
||||
if __name__ == '__main__':
|
||||
main(sys.argv)
|
||||
|
Loading…
x
Reference in New Issue
Block a user