diff --git a/README.rst b/README.rst index 2b47792..3e53867 100644 --- a/README.rst +++ b/README.rst @@ -95,7 +95,8 @@ Job Configuration Jobs are defined using yaml files. Options may be specified either at the top-level or on individual seeds. A job id and at least one seed url -must be specified, everything else is optional. +must be specified, everything else is optional. For details, see +`<job-conf.rst>`_. :: diff --git a/brozzler/job.py b/brozzler/job.py index bfaef4d..85e955d 100644 --- a/brozzler/job.py +++ b/brozzler/job.py @@ -1,21 +1,21 @@ -# -# brozzler/job.py - Job class representing a brozzler crawl job, and functions -# for setting up a job with supplied configuration -# -# Copyright (C) 2014-2016 Internet Archive -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# +''' +brozzler/job.py - Job class representing a brozzler crawl job, and functions +for setting up a job with supplied configuration + +Copyright (C) 2014-2016 Internet Archive + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+''' import logging import brozzler @@ -87,7 +87,7 @@ def new_site(frontier, site): frontier.new_page(page) logging.info("queued page %s", page) else: - logging.warn("seed url {} is blocked by robots.txt".format(site.seed)) + logging.warn("seed url %s is blocked by robots.txt", site.seed) finally: # finally block because we want to insert the Site no matter what frontier.new_site(site) diff --git a/brozzler/pywb.py b/brozzler/pywb.py index 8f1ece8..dc9072a 100644 --- a/brozzler/pywb.py +++ b/brozzler/pywb.py @@ -1,6 +1,7 @@ -#!/usr/bin/env python ''' -brozzler/pywb.py - pywb support for rethinkdb index +brozzler/pywb.py - pywb customizations for brozzler including rethinkdb index, +loading from warcs still being written to, and canonicalization rules matching +brozzler conventions Copyright (C) 2016 Internet Archive @@ -35,6 +36,7 @@ import rethinkstuff import rethinkdb import surt import json +import brozzler class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource): def __init__(self, servers, db, table): @@ -192,3 +194,13 @@ def support_in_progress_warcs(): results.append('%s.open' % warc_path) return results pywb.warc.pathresolvers.PrefixResolver.__call__ = _prefix_resolver_call + +def main(argv=sys.argv): + brozzler.pywb.TheGoodUrlCanonicalizer.replace_default_canonicalizer() + brozzler.pywb.TheGoodUrlCanonicalizer.monkey_patch_dsrules_init() + brozzler.pywb.support_in_progress_warcs() + wayback_cli = pywb.apps.cli.WaybackCli( + args=argv[1:], default_port=8880, + desc=('brozzler-wayback - pywb wayback (monkey-patched for use ' + 'with brozzler)')) + wayback_cli.run() diff --git a/brozzler/webconsole/static/partials/workers.html b/brozzler/webconsole/static/partials/workers.html index 61f9a61..5f39c77 100644 --- a/brozzler/webconsole/static/partials/workers.html +++ b/brozzler/webconsole/static/partials/workers.html @@ -12,11 +12,19 @@

Workers

+

This page depends on some deployment details outside of brozzler + itself, namely that port 8901 on each brozzler-worker is running + websockify bridging VNC running on the same host. The vagrant+ansible + configuration in the brozzler repo contains an example of that. + https://github.com/internetarchive/brozzler/tree/master/vagrant +

+
+
{{worker}}
{{worker.host}}
-
diff --git a/job-conf.rst b/job-conf.rst new file mode 100644 index 0000000..056c7ca --- /dev/null +++ b/job-conf.rst @@ -0,0 +1,205 @@ +brozzler job configuration +************************** + +Jobs are defined using yaml files. Options may be specified either at the +top-level or on individual seeds. A job id and at least one seed url +must be specified, everything else is optional. + +an example +========== + +:: + + id: myjob + time_limit: 60 # seconds + proxy: 127.0.0.1:8000 # point at warcprox for archiving + ignore_robots: false + enable_warcprox_features: false + warcprox_meta: + warc-prefix: job1 + stats: + buckets: + - job1-stats + metadata: {} + seeds: + - url: http://one.example.org/ + warcprox_meta: + warc-prefix: job1-seed1 + stats: + buckets: + - job1-seed1-stats + - url: http://two.example.org/ + time_limit: 30 + - url: http://three.example.org/ + time_limit: 10 + ignore_robots: true + scope: + surt: http://(org,example, + +how inheritance works +===================== + +Most of the available options apply to seeds. Such options can also be +specified at the top level, in which case the seeds inherit the options. If +an option is specified both at the top level and at the level of an individual +seed, the results are merged with the seed-level value taking precedence in +case of conflicts. It's probably easiest to make sense of this by way of an +example. + +In the example yaml above, ``warcprox_meta`` is specified at the top level and +at the seed level for the seed http://one.example.org/. 
At the top level we +have:: + + warcprox_meta: + warc-prefix: job1 + stats: + buckets: + - job1-stats + +At the seed level we have:: + + warcprox_meta: + warc-prefix: job1-seed1 + stats: + buckets: + - job1-seed1-stats + +The merged configuration as applied to the seed http://one.example.org/ will +be:: + + warcprox_meta: + warc-prefix: job1-seed1 + stats: + buckets: + - job1-stats + - job1-seed1-stats + +Notice that: + +- There is a collision on ``warc-prefix`` and the seed-level value wins. +- Since ``buckets`` is a list, the merged result includes all the values from + both the top level and the seed level. + +settings reference +================== + +id +-- ++-----------+--------+----------+---------+ +| scope | type | required | default | ++===========+========+==========+=========+ +| top-level | string | yes? | *n/a* | ++-----------+--------+----------+---------+ +An arbitrary identifier for this job. Must be unique across this deployment of +brozzler. + +seeds +----- ++-----------+------------------------+----------+---------+ +| scope | type | required | default | ++===========+========================+==========+=========+ +| top-level | list (of dictionaries) | yes | *n/a* | ++-----------+------------------------+----------+---------+ +List of seeds. Each item in the list is a dictionary (associative array) which +defines the seed. It must specify ``url`` (see below) and can additionally +specify any of the settings of scope *seed-level*. + +url +--- ++------------+--------+----------+---------+ +| scope | type | required | default | ++============+========+==========+=========+ +| seed-level | string | yes | *n/a* | ++------------+--------+----------+---------+ +The seed url. 
+ +time_limit +---------- ++-----------------------+--------+----------+---------+ +| scope | type | required | default | ++=======================+========+==========+=========+ +| seed-level, top-level | number | no | *none* | ++-----------------------+--------+----------+---------+ +Time limit in seconds. If not specified, there is no time limit. Time limit is +enforced at the seed level. If a time limit is specified at the top level, it +is inherited by each seed as described above, and enforced individually on each +seed. + +proxy +----- ++-----------------------+--------+----------+---------+ +| scope | type | required | default | ++=======================+========+==========+=========+ +| seed-level, top-level | string | no | *none* | ++-----------------------+--------+----------+---------+ +HTTP proxy, with the format ``host:port``. Typically configured to point to +warcprox for archival crawling. + +enable_warcprox_features +------------------------ ++-----------------------+---------+----------+---------+ +| scope | type | required | default | ++=======================+=========+==========+=========+ +| seed-level, top-level | boolean | no | false | ++-----------------------+---------+----------+---------+ +If true for a given seed, and the seed is configured to use a proxy, enables +special features that assume the proxy is an instance of warcprox. As of this +writing, the special features that are enabled are: + +- sending screenshots and thumbnails to warcprox using a WARCPROX_WRITE_RECORD + request +- sending youtube-dl metadata json to warcprox using a WARCPROX_WRITE_RECORD + request + +See the warcprox docs for information on the WARCPROX_WRITE_RECORD method (XXX +not yet written). 
+ +*Note that if* ``warcprox_meta`` *and* ``proxy`` *are configured, the +Warcprox-Meta header will be sent even if* ``enable_warcprox_features`` *is not +set.* + +ignore_robots +------------- ++-----------------------+---------+----------+---------+ +| scope | type | required | default | ++=======================+=========+==========+=========+ +| seed-level, top-level | boolean | no | false | ++-----------------------+---------+----------+---------+ +If set to ``true``, brozzler will happily crawl pages that would otherwise be +blocked by robots.txt rules. + +warcprox_meta +------------- ++-----------------------+------------+----------+---------+ +| scope | type | required | default | ++=======================+============+==========+=========+ +| seed-level, top-level | dictionary | no | false | ++-----------------------+------------+----------+---------+ +Specifies the Warcprox-Meta header to send with every request, if ``proxy`` is +configured. The value of the Warcprox-Meta header is a json blob. It is used to +pass settings and information to warcprox. Warcprox does not forward the header +on to the remote site. See the warcprox docs for more information (XXX not yet +written). + +Brozzler takes the configured value of ``warcprox_meta``, converts it to +json and populates the Warcprox-Meta header with that value. For example:: + + warcprox_meta: + warc-prefix: job1-seed1 + stats: + buckets: + - job1-stats + - job1-seed1-stats + +becomes:: + + Warcprox-Meta: {"warc-prefix":"job1-seed1","stats":{"buckets":["job1-stats","job1-seed1-stats"]}} + +scope +----- ++-----------------------+------------+----------+---------+ +| scope | type | required | default | ++=======================+============+==========+=========+ +| seed-level, top-level | dictionary | no | false | ++-----------------------+------------+----------+---------+ +Scope rules. 
*TODO* diff --git a/setup.py b/setup.py index 036d336..912a829 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b6.dev78', + version='1.1b6.dev87', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', @@ -53,6 +53,7 @@ setuptools.setup( 'brozzler-ensure-tables=brozzler.cli:brozzler_ensure_tables', 'brozzler-webconsole=brozzler.webconsole:main', 'brozzler-easy=brozzler.easy:main', + 'brozzler-wayback=brozzler.pywb:main', ], }, install_requires=[ diff --git a/tests/htdocs/file1.txt b/tests/htdocs/file1.txt new file mode 100644 index 0000000..d4a2f1c --- /dev/null +++ b/tests/htdocs/file1.txt @@ -0,0 +1 @@ +I'm a plain text file. diff --git a/tests/test_cluster.py b/tests/test_cluster.py new file mode 100644 index 0000000..7f8033d --- /dev/null +++ b/tests/test_cluster.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python +''' +cluster-integration-tests.py - integration tests for a brozzler cluster, +expects brozzler, warcprox, pywb, rethinkdb and other dependencies to be +running already + +Copyright (C) 2016 Internet Archive + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+''' + +import pytest +import http.server +import threading +import urllib.request +import os +import socket +import rethinkstuff + +@pytest.fixture(scope='module') +def httpd(request): + # SimpleHTTPRequestHandler always uses CWD so we have to chdir + os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs')) + + httpd = http.server.HTTPServer( + ('localhost', 0), http.server.SimpleHTTPRequestHandler) + httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever) + httpd_thread.start() + + def fin(): + httpd.shutdown() + httpd.server_close() + httpd_thread.join() + request.addfinalizer(fin) + + return httpd + +def test_httpd(httpd): + ''' + Tests that our http server is working as expected, and that two fetches + of the same url return the same payload, proving it can be used to test + deduplication. + ''' + payload1 = content2 = None + with urllib.request.urlopen( + 'http://localhost:%s/' % httpd.server_port) as response: + assert response.status == 200 + payload1 = response.read() + assert payload1 + + with urllib.request.urlopen( + 'http://localhost:%s/' % httpd.server_port) as response: + assert response.status == 200 + payload2 = response.read() + assert payload2 + + assert payload1 == payload2 + +def test_services_up(): + '''Check that the expected services are up and running.''' + # check that warcprox is listening + with socket.socket() as s: + # if the connect fails an exception is raised and the test fails + s.connect(('localhost', 8000)) + + ### # check that pywb is listening + ### with socket.socket() as s: + ### # if the connect fails an exception is raised and the test fails + ### s.connect(('localhost', 8880)) + + # check that rethinkdb is listening and looks sane + r = rethinkstuff.Rethinker(db='rethinkdb') # built-in db + tbls = r.table_list().run() + assert len(tbls) > 10 + +def test_brozzle_site(httpd): + pass + diff --git a/vagrant/ansible/hosts b/vagrant/ansible/hosts index 708a07a..0257aeb 100644 --- a/vagrant/ansible/hosts +++ 
b/vagrant/ansible/hosts @@ -13,4 +13,4 @@ ansible_ssh_private_key_file=.vagrant/machines/10.9.9.9/virtualbox/private_key 10.9.9.9 [pywb] -10.9.9.9 \ No newline at end of file +10.9.9.9 diff --git a/vagrant/ansible/playbook.yml b/vagrant/ansible/playbook.yml index fda30b8..f55a4b1 100644 --- a/vagrant/ansible/playbook.yml +++ b/vagrant/ansible/playbook.yml @@ -24,7 +24,7 @@ roles: - brozzler-webconsole -# - name: deploy pywb -# hosts: pywb -# roles: -# - pywb +- name: deploy pywb + hosts: pywb + roles: + - pywb diff --git a/vagrant/ansible/roles/brozzler-webconsole/tasks/main.yml b/vagrant/ansible/roles/brozzler-webconsole/tasks/main.yml index f0f70d8..1d7194a 100644 --- a/vagrant/ansible/roles/brozzler-webconsole/tasks/main.yml +++ b/vagrant/ansible/roles/brozzler-webconsole/tasks/main.yml @@ -1,6 +1,5 @@ --- - name: install brozzler[webconsole] in virtualenv - become: true pip: name='-e /brozzler[webconsole]' virtualenv=/home/vagrant/brozzler-webconsole-ve34 virtualenv_python=python3.4 @@ -12,4 +11,4 @@ template: src=templates/brozzler-webconsole.conf.j2 dest=/etc/init/brozzler-webconsole.conf notify: - - restart brozzler-webconsole \ No newline at end of file + - restart brozzler-webconsole diff --git a/vagrant/ansible/roles/brozzler-worker/tasks/main.yml b/vagrant/ansible/roles/brozzler-worker/tasks/main.yml index a4ec194..7dad56a 100644 --- a/vagrant/ansible/roles/brozzler-worker/tasks/main.yml +++ b/vagrant/ansible/roles/brozzler-worker/tasks/main.yml @@ -26,7 +26,6 @@ - ttf-indic-fonts - fonts-thai-tlwg - fonts-lklug-sinhala - - python3-pip - git - libjpeg-turbo8-dev - zlib1g-dev @@ -49,7 +48,6 @@ notify: - restart vnc-websock - name: install brozzler in virtualenv - become: true pip: # name=git+https://github.com/internetarchive/brozzler.git#egg=brozzler name='-e /brozzler' virtualenv=/home/vagrant/brozzler-ve34 diff --git a/vagrant/ansible/roles/common/tasks/main.yml b/vagrant/ansible/roles/common/tasks/main.yml index f9012ca..9a14357 100644 --- 
a/vagrant/ansible/roles/common/tasks/main.yml +++ b/vagrant/ansible/roles/common/tasks/main.yml @@ -1,4 +1,24 @@ --- -- name: ensure logs directory exists +## # get latest pip (had problems with version from apt-get, specifically +## # "pip install pyopenssl" did not install the dependency "cryptography") +## # http://stackoverflow.com/questions/34587473/what-is-get-pip-py-checksum-where-can-i-get-it-for-sure +## - name: install setuptools for python 2 and 3 +## become: true +## apt: name={{item}} state=present +## with_items: +## - python-setuptools +## - python3-setuptools +## - name: download pip-8.1.2.tar.gz +## get_url: +## url: https://pypi.python.org/packages/e7/a8/7556133689add8d1a54c0b14aeff0acb03c64707ce100ecd53934da1aa13/pip-8.1.2.tar.gz +## dest: /tmp +## checksum: sha1:1c13c247967ec5bee6de5fd104c5d78ba30951c7 +## - name: extract pip-8.1.2.tar.gz +## unarchive: src=/tmp/pip-8.1.2.tar.gz dest=/tmp copy=no +## - name: run "python3 setup.py install" in /tmp/pip-8.1.2 +## command: python3 setup.py install chdir=/tmp/pip-8.1.2 +## creates=/usr/local/lib/python2.7/dist-packages/pip-8.1.2-py2.7.egg/pip/__init__.py +## become: true +- name: mkdir /vagrant/logs file: path=/vagrant/logs state=directory become: true diff --git a/vagrant/ansible/roles/pywb/handlers/main.yml b/vagrant/ansible/roles/pywb/handlers/main.yml new file mode 100644 index 0000000..4424b3e --- /dev/null +++ b/vagrant/ansible/roles/pywb/handlers/main.yml @@ -0,0 +1,5 @@ +--- +- name: restart pywb + service: name=pywb state=restarted + become: true + diff --git a/vagrant/ansible/roles/pywb/tasks/main.yml b/vagrant/ansible/roles/pywb/tasks/main.yml new file mode 100644 index 0000000..08fac19 --- /dev/null +++ b/vagrant/ansible/roles/pywb/tasks/main.yml @@ -0,0 +1,27 @@ +--- +- name: install pywb in virtualenv + pip: name=pywb + virtualenv=/home/vagrant/pywb-ve34 + virtualenv_python=python3.4 + extra_args='--no-input --upgrade --pre' + notify: + - restart pywb +- name: install brozzler in pywb 
virtualenv + pip: name='-e /brozzler' + virtualenv=/home/vagrant/pywb-ve34 + virtualenv_python=python3.4 + extra_args='--no-input --upgrade --pre' + notify: + - restart pywb +- name: pywb config file /etc/pywb.yml + template: src=templates/pywb.yml.j2 + dest=/etc/pywb.yml + become: true + notify: + - restart pywb +- name: upstart config file /etc/init/pywb.conf + template: src=templates/pywb.conf.j2 + dest=/etc/init/pywb.conf + become: true + notify: + - restart pywb diff --git a/vagrant/ansible/roles/pywb/templates/pywb.conf.j2 b/vagrant/ansible/roles/pywb/templates/pywb.conf.j2 new file mode 100644 index 0000000..5b2887e --- /dev/null +++ b/vagrant/ansible/roles/pywb/templates/pywb.conf.j2 @@ -0,0 +1,14 @@ +description "pywb" + +start on runlevel [2345] +stop on runlevel [!2345] + +env PYTHONPATH=/home/vagrant/pywb-ve34/lib/python3.4/site-packages +env PATH=/home/vagrant/pywb-ve34/bin:/usr/bin:/bin +env PYWB_CONFIG_FILE=/etc/pywb.yml + +setuid vagrant + +# console log + +exec nice brozzler-wayback >>/vagrant/logs/pywb.log 2>&1 diff --git a/vagrant/ansible/roles/pywb/templates/pywb.yml.j2 b/vagrant/ansible/roles/pywb/templates/pywb.yml.j2 new file mode 100644 index 0000000..c17281c --- /dev/null +++ b/vagrant/ansible/roles/pywb/templates/pywb.yml.j2 @@ -0,0 +1,12 @@ +archive_paths: /vagrant/warcs/ +collections: + brozzler: + index_paths: !!python/object:brozzler.pywb.RethinkCDXSource + db: brozzler + servers: [localhost] + table: captures +enable_auto_colls: false +enable_cdx_api: true +framed_replay: true +port: 8880 + diff --git a/vagrant/ansible/roles/rethinkdb/tasks/main.yml b/vagrant/ansible/roles/rethinkdb/tasks/main.yml index 77bbb89..7f40ff4 100644 --- a/vagrant/ansible/roles/rethinkdb/tasks/main.yml +++ b/vagrant/ansible/roles/rethinkdb/tasks/main.yml @@ -10,12 +10,14 @@ apt: name=rethinkdb state=present become: true notify: - - restart rethinkdb + - restart rethinkdb +# XXX rethinkdb fails to start in spite of this, I think because /vagrant +# gets 
mounted too late, and it tries to log there - name: ensure rethinkdb starts on reboot service: name=rethinkdb enabled=yes - name: ensure rethinkdb instance config file is installed - template: src=templates/rethinkdb-brozzler-easy.conf.j2 - dest=/etc/rethinkdb/instances.d/rethinkdb-brozzler-easy.conf + template: src=templates/rethinkdb-brozzler-vagrant-1.conf.j2 + dest=/etc/rethinkdb/instances.d/rethinkdb-brozzler-vagrant-1.conf become: true notify: - - restart rethinkdb + - restart rethinkdb diff --git a/vagrant/ansible/roles/rethinkdb/templates/rethinkdb-brozzler-easy.conf.j2 b/vagrant/ansible/roles/rethinkdb/templates/rethinkdb-brozzler-vagrant-1.conf.j2 similarity index 100% rename from vagrant/ansible/roles/rethinkdb/templates/rethinkdb-brozzler-easy.conf.j2 rename to vagrant/ansible/roles/rethinkdb/templates/rethinkdb-brozzler-vagrant-1.conf.j2 diff --git a/vagrant/ansible/roles/warcprox/tasks/main.yml b/vagrant/ansible/roles/warcprox/tasks/main.yml index c9f611d..7a9c7d0 100644 --- a/vagrant/ansible/roles/warcprox/tasks/main.yml +++ b/vagrant/ansible/roles/warcprox/tasks/main.yml @@ -3,23 +3,23 @@ become: true apt: name={{item}} state=present with_items: - - gcc - - python-virtualenv - - python3.4 - - libpython3.4-dev - - libffi-dev - - libssl-dev - - tor - - git + - gcc + - python-virtualenv + - python3.4 + - libpython3.4-dev + - libffi-dev + - libssl-dev + - tor + - git - name: install warcprox in virtualenv pip: name=git+https://github.com/internetarchive/warcprox.git@2.x#egg=warcprox virtualenv=/home/vagrant/warcprox-ve34 virtualenv_python=python3.4 extra_args='--no-input --upgrade --pre' notify: - - restart warcprox + - restart warcprox - name: install upstart config /etc/init/warcprox.conf become: true template: src=templates/warcprox.conf.j2 dest=/etc/init/warcprox.conf notify: - - restart warcprox + - restart warcprox diff --git a/vagrant/run-tests.sh b/vagrant/run-tests.sh new file mode 100755 index 0000000..42cd6f9 --- /dev/null +++ 
b/vagrant/run-tests.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +echo service status: +vagrant ssh -- 'status warcprox ; + status Xvnc ; + status brozzler-worker ; + status brozzler-webconsole ; + status vnc-websock' +echo + +vagrant ssh -- 'source brozzler-ve34/bin/activate && pip install pytest' +vagrant ssh -- 'source brozzler-ve34/bin/activate && py.test -v -s /brozzler/tests' diff --git a/vagrant/vagrant-brozzler-new-job.py b/vagrant/vagrant-brozzler-new-job.py new file mode 100755 index 0000000..767091b --- /dev/null +++ b/vagrant/vagrant-brozzler-new-job.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python +''' +vagrant-brozzler-new-job.py - runs brozzler-new-job inside the vagrant vm to +queue a job for your vagrant brozzler deployment. + +This is a standalone script with no dependencies other than python, and should +work with python 2.7 or python 3.2+. The only reason it's not a bash script is +so we can use the argparse library. +''' + +import sys +import os +import argparse +import subprocess + +def main(argv=[]): + arg_parser = argparse.ArgumentParser(prog=os.path.basename(argv[0])) + arg_parser.add_argument( + 'job_conf_file', metavar='JOB_CONF_FILE', + help='brozzler job configuration file in yaml') + args = arg_parser.parse_args(args=argv[1:]) + + with open(args.job_conf_file, 'rb') as f: + yaml_bytes = f.read() + subprocess.call( + ['vagrant', 'ssh', '--', 'f=`mktemp` && cat > $f'], + stdin=yaml_bytes) + + # cd to path with Vagrantfile so "vagrant ssh" knows what to do + os.chdir(os.path.dirname(__file__)) + +if __name__ == '__main__': + main(sys.argv) + +## # cd to path with Vagrantfile so "vagrant ssh" knows what to do +## script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +## cd $script_dir +## +## vagrant ssh -- \ +## PYTHONPATH=/home/vagrant/brozzler-ve34/lib/python3.4/site-packages \ +## /home/vagrant/brozzler-ve34/bin/python \ +## /home/vagrant/brozzler-ve34/bin/brozzler-new-job "$@" diff --git a/vagrant/vagrant-brozzler-new-site.py 
b/vagrant/vagrant-brozzler-new-site.py new file mode 100755 index 0000000..9986fef --- /dev/null +++ b/vagrant/vagrant-brozzler-new-site.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python +''' +vagrant-brozzler-new-site.py - runs brozzler-new-site inside the vagrant vm to +queue a site for your vagrant brozzler deployment. + +Fills in the --proxy option automatically. Some other options are passed +through. + +This is a standalone script with no dependencies other than python, and should +work with python 2.7 or python 3.2+. The only reason it's not a bash script is +so we can use the argparse library. + +Copyright (C) 2014-2016 Internet Archive + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+''' + +import sys +import os +import argparse +import subprocess +try: + from shlex import quote +except: + from pipes import quote + +def main(argv=[]): + arg_parser = argparse.ArgumentParser(prog=os.path.basename(argv[0])) + arg_parser.add_argument('seed', metavar='SEED', help='seed url') + arg_parser.add_argument( + '--time-limit', dest='time_limit', default=None, + help='time limit in seconds for this site') + arg_parser.add_argument( + '--ignore-robots', dest='ignore_robots', action='store_true', + help='ignore robots.txt for this site') + arg_parser.add_argument( + '--warcprox-meta', dest='warcprox_meta', + help=( + 'Warcprox-Meta http request header to send with each request; ' + 'must be a json blob, ignored unless warcprox features are ' + 'enabled')) + arg_parser.add_argument( + '-q', '--quiet', dest='quiet', action='store_true') + arg_parser.add_argument( + '-v', '--verbose', dest='verbose', action='store_true') + + args = arg_parser.parse_args(args=argv[1:]) + + options = [] + if args.time_limit: + options.append('--time-limit=%s' % args.time_limit) + if args.ignore_robots: + options.append('--ignore-robots') + if args.warcprox_meta: + # I think this shell escaping is correct? + options.append( + '--warcprox-meta=%s' % quote(args.warcprox_meta)) + if args.quiet: + options.append('--quiet') + if args.verbose: + options.append('--verbose') + + # cd to path with Vagrantfile so "vagrant ssh" knows what to do + os.chdir(os.path.dirname(__file__)) + + cmd = ( + 'PYTHONPATH=/home/vagrant/brozzler-ve34/lib/python3.4/site-packages ' + '/home/vagrant/brozzler-ve34/bin/python ' + '/home/vagrant/brozzler-ve34/bin/brozzler-new-site ' + '--proxy=localhost:8000 --enable-warcprox-features %s %s') % ( + ' '.join(options), args.seed) + subprocess.call(['vagrant', 'ssh', '--', cmd]) + +if __name__ == '__main__': + main(sys.argv) +