Merge branch 'master' into qa

* master: document a bunch of job settings starting on documenting job configuration replace vagrant-brozzler-new-site with python script that fills in default options and passes through others add missing rethinkdb config file to ansible config new script runs brozzler-new-site queues a new site to brozzle on the vagrant brozzler deployment better logs for facebook logins working on including pywb in vagrant environment (not finished) starting to create a framework for testing header comment tweak new prog "brozzler-wayback" runs monkey-patched pywb
2025-04-20 23:56:34 -04:00 · 2016-09-30 08:51:17 -07:00 · 2016-09-30 08:51:17 -07:00 · 80883c9784
commit 80883c9784
parent 659d46afdd bfd4c1f8c6
23 changed files with 582 additions and 49 deletions
--- a/README.rst
+++ b/README.rst
@ -95,7 +95,8 @@ Job Configuration

 Jobs are defined using yaml files. Options may be specified either at the
 top-level or on individual seeds. A job id and at least one seed url
-must be specified, everything else is optional.
+must be specified, everything else is optional. For details, see
+`<job-conf.rst>`_.

 ::

--- a/brozzler/job.py
+++ b/brozzler/job.py
@ -1,21 +1,21 @@
-#
-# brozzler/job.py - Job class representing a brozzler crawl job, and functions
-# for setting up a job with supplied configuration
-#
-# Copyright (C) 2014-2016 Internet Archive
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
+'''
+brozzler/job.py - Job class representing a brozzler crawl job, and functions
+for setting up a job with supplied configuration
+
+Copyright (C) 2014-2016 Internet Archive
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''

 import logging
 import brozzler
@ -87,7 +87,7 @@ def new_site(frontier, site):
                frontier.new_page(page)
                logging.info("queued page %s", page)
            else:
-                logging.warn("seed url {} is blocked by robots.txt".format(site.seed))
+                logging.warn("seed url %s is blocked by robots.txt", site.seed)
        finally:
            # finally block because we want to insert the Site no matter what
            frontier.new_site(site)
--- a/brozzler/pywb.py
+++ b/brozzler/pywb.py
@ -1,6 +1,7 @@
-#!/usr/bin/env python
 '''
-brozzler/pywb.py - pywb support for rethinkdb index
+brozzler/pywb.py - pywb customizations for brozzler including rethinkdb index,
+loading from warcs still being written to, and canonicalization rules matching
+brozzler conventions

 Copyright (C) 2016 Internet Archive

@ -35,6 +36,7 @@ import rethinkstuff
 import rethinkdb
 import surt
 import json
+import brozzler

 class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
    def __init__(self, servers, db, table):
@ -192,3 +194,13 @@ def support_in_progress_warcs():
            results.append('%s.open' % warc_path)
        return results
    pywb.warc.pathresolvers.PrefixResolver.__call__ = _prefix_resolver_call
+
+def main(argv=sys.argv):
+    '''
+    Entry point for the "brozzler-wayback" command: applies the brozzler
+    customizations to pywb, then runs pywb's wayback command line app.
+
+    `argv` is the full argument vector; `argv[0]` is skipped and the rest is
+    handed to `pywb.apps.cli.WaybackCli`. Note that the default value is
+    bound to `sys.argv` once, when this module is imported.
+    '''
+    # apply the brozzler monkey patches first, then build and run the cli
+    brozzler.pywb.TheGoodUrlCanonicalizer.replace_default_canonicalizer()
+    brozzler.pywb.TheGoodUrlCanonicalizer.monkey_patch_dsrules_init()
+    brozzler.pywb.support_in_progress_warcs()
+    wayback_cli = pywb.apps.cli.WaybackCli(
+            args=argv[1:], default_port=8880,
+            desc=('brozzler-wayback - pywb wayback (monkey-patched for use '
+                  'with brozzler)'))
+    wayback_cli.run()
--- a/brozzler/webconsole/static/partials/workers.html
+++ b/brozzler/webconsole/static/partials/workers.html
@ -12,11 +12,19 @@
 <div>
    <h2>Workers</h2>

+    <p><i>This page depends on some deployment details outside of brozzler
+      itself, namely that port 8901 on each brozzler-worker is running
+      websockify bridging VNC running on the same host. The vagrant+ansible
+      configuration in the brozzler repo contains an example of that.
+      https://github.com/internetarchive/brozzler/tree/master/vagrant
+    </i></p>
+
    <div class="row">
        <div class="col-xs-12 col-sm-12 col-md-6 col-lg-6" ng-repeat="worker in workers">
+            <div>{{worker}}</div>
            <div>{{worker.host}}</div>
-            <iframe style="width:45rem;height:32rem;" 
-		    ng-src="{{'/static/noVNC/vnc.html?host=' + worker.host + '&port=8901&autoconnect=1&resize=downscale'}}">
+            <iframe style="width:45rem;height:32rem;"
+              ng-src="{{'/static/noVNC/vnc.html?host=' + worker.host + '&port=8901&autoconnect=1&resize=downscale'}}">
            </iframe>
        </div>
    </div>
--- a/job-conf.rst
+++ b/job-conf.rst
@ -0,0 +1,205 @@
+brozzler job configuration
+**************************
+
+Jobs are defined using yaml files. Options may be specified either at the
+top-level or on individual seeds. A job id and at least one seed url
+must be specified, everything else is optional.
+
+an example
+==========
+
+::
+
+    id: myjob
+    time_limit: 60 # seconds
+    proxy: 127.0.0.1:8000 # point at warcprox for archiving
+    ignore_robots: false
+    enable_warcprox_features: false
+    warcprox_meta:
+      warc-prefix: job1
+      stats:
+        buckets:
+        - job1-stats
+    metadata: {}
+    seeds:
+    - url: http://one.example.org/
+      warcprox_meta:
+        warc-prefix: job1-seed1
+        stats:
+          buckets:
+          - job1-seed1-stats
+    - url: http://two.example.org/
+      time_limit: 30
+    - url: http://three.example.org/
+      time_limit: 10
+      ignore_robots: true
+      scope:
+        surt: http://(org,example,
+
+how inheritance works
+=====================
+
+Most of the available options apply to seeds. Such options can also be
+specified at the top level, in which case the seeds inherit the options. If
+an option is specified both at the top level and at the level of an individual
+seed, the results are merged with the seed-level value taking precedence in
+case of conflicts. It's probably easiest to make sense of this by way of an
+example.
+
+In the example yaml above, ``warcprox_meta`` is specified at the top level and
+at the seed level for the seed http://one.example.org/. At the top level we
+have::
+
+  warcprox_meta:
+    warc-prefix: job1
+    stats:
+      buckets:
+      - job1-stats
+
+At the seed level we have::
+
+    warcprox_meta:
+      warc-prefix: job1-seed1
+      stats:
+        buckets:
+        - job1-seed1-stats
+
+The merged configuration as applied to the seed http://one.example.org/ will
+be::
+
+    warcprox_meta:
+      warc-prefix: job1-seed1
+      stats:
+        buckets:
+        - job1-stats
+        - job1-seed1-stats
+
+Notice that:
+
+- There is a collision on ``warc-prefix`` and the seed-level value wins.
+- Since ``buckets`` is a list, the merged result includes all the values from
+  both the top level and the seed level.
+
+settings reference
+==================
+
+id
+--
+-----------+--------+----------+---------+
+| scope     | type   | required | default |
+===========+========+==========+=========+
+| top-level | string | yes?     | *n/a*   |
+-----------+--------+----------+---------+
+An arbitrary identifier for this job. Must be unique across this deployment of
+brozzler.
+
+seeds
+-----
+-----------+------------------------+----------+---------+
+| scope     | type                   | required | default |
+===========+========================+==========+=========+
+| top-level | list (of dictionaries) | yes      | *n/a*   |
+-----------+------------------------+----------+---------+
+List of seeds. Each item in the list is a dictionary (associative array) which
+defines the seed. It must specify ``url`` (see below) and can additionally
+specify any of the settings of scope *seed-level*.
+
+url
+---
+------------+--------+----------+---------+
+| scope      | type   | required | default |
+============+========+==========+=========+
+| seed-level | string | yes      | *n/a*   |
+------------+--------+----------+---------+
+The seed url.
+
+time_limit
+----------
+-----------------------+--------+----------+---------+
+| scope                 | type   | required | default |
+=======================+========+==========+=========+
+| seed-level, top-level | number | no       | *none*  |
+-----------------------+--------+----------+---------+
+Time limit in seconds. If not specified, there is no time limit. Time limit is
+enforced at the seed level. If a time limit is specified at the top level, it
+is inherited by each seed as described above, and enforced individually on each
+seed.
+
+proxy
+-----
+-----------------------+--------+----------+---------+
+| scope                 | type   | required | default |
+=======================+========+==========+=========+
+| seed-level, top-level | string | no       | *none*  |
+-----------------------+--------+----------+---------+
+HTTP proxy, with the format ``host:port``. Typically configured to point to
+warcprox for archival crawling.
+
+enable_warcprox_features
+------------------------
+-----------------------+---------+----------+---------+
+| scope                 | type    | required | default |
+=======================+=========+==========+=========+
+| seed-level, top-level | boolean | no       | false   |
+-----------------------+---------+----------+---------+
+If true for a given seed, and the seed is configured to use a proxy, enables
+special features that assume the proxy is an instance of warcprox. As of this
+writing, the special features that are enabled are:
+
+- sending screenshots and thumbnails to warcprox using a WARCPROX_WRITE_RECORD
+  request
+- sending youtube-dl metadata json to warcprox using a WARCPROX_WRITE_RECORD
+  request
+
+See the warcprox docs for information on the WARCPROX_WRITE_RECORD method (XXX
+not yet written).
+
+*Note that if* ``warcprox_meta`` *and* ``proxy`` *are configured, the
+Warcprox-Meta header will be sent even if* ``enable_warcprox_features`` *is not
+set.*
+
+ignore_robots
+-------------
+-----------------------+---------+----------+---------+
+| scope                 | type    | required | default |
+=======================+=========+==========+=========+
+| seed-level, top-level | boolean | no       | false   |
+-----------------------+---------+----------+---------+
+If set to ``true``, brozzler will happily crawl pages that would otherwise be
+blocked by robots.txt rules.
+
+warcprox_meta
+-------------
+-----------------------+------------+----------+---------+
+| scope                 | type       | required | default |
+=======================+============+==========+=========+
+| seed-level, top-level | dictionary | no       | false   |
+-----------------------+------------+----------+---------+
+Specifies the Warcprox-Meta header to send with every request, if ``proxy`` is
+configured. The value of the Warcprox-Meta header is a json blob. It is used to
+pass settings and information to warcprox. Warcprox does not forward the header
+on to the remote site. See the warcprox docs for more information (XXX not yet
+written).
+
+Brozzler takes the configured value of ``warcprox_meta``, converts it to
+json and populates the Warcprox-Meta header with that value. For example::
+
+    warcprox_meta:
+      warc-prefix: job1-seed1
+      stats:
+        buckets:
+        - job1-stats
+        - job1-seed1-stats
+
+becomes::
+
+    Warcprox-Meta: {"warc-prefix":"job1-seed1","stats":{"buckets":["job1-stats","job1-seed1-stats"]}}
+
+scope
+-----
+-----------------------+------------+----------+---------+
+| scope                 | type       | required | default |
+=======================+============+==========+=========+
+| seed-level, top-level | dictionary | no       | false   |
+-----------------------+------------+----------+---------+
+Scope rules. *TODO*
--- a/setup.py
+++ b/setup.py
@ -32,7 +32,7 @@ def find_package_data(package):

 setuptools.setup(
        name='brozzler',
-        version='1.1b6.dev78',
+        version='1.1b6.dev87',
        description='Distributed web crawling with browsers',
        url='https://github.com/internetarchive/brozzler',
        author='Noah Levitt',
@ -53,6 +53,7 @@ setuptools.setup(
                'brozzler-ensure-tables=brozzler.cli:brozzler_ensure_tables',
                'brozzler-webconsole=brozzler.webconsole:main',
                'brozzler-easy=brozzler.easy:main',
+                'brozzler-wayback=brozzler.pywb:main',
            ],
        },
        install_requires=[
--- a/tests/htdocs/file1.txt
+++ b/tests/htdocs/file1.txt
@ -0,0 +1 @@
+I'm a plain text file.
--- a/tests/test_cluster.py
+++ b/tests/test_cluster.py
@ -0,0 +1,88 @@
+#!/usr/bin/env python
+'''
+tests/test_cluster.py - integration tests for a brozzler cluster,
+expects brozzler, warcprox, pywb, rethinkdb and other dependencies to be
+running already
+
+Copyright (C) 2016 Internet Archive
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+import pytest
+import http.server
+import threading
+import urllib.request
+import os
+import socket
+import rethinkstuff
+
+@pytest.fixture(scope='module')
+def httpd(request):
+    '''
+    Module-scoped fixture that serves the `htdocs` directory next to this
+    file over http from a background thread, and returns the `HTTPServer`
+    (tests read the listening port from `httpd.server_port`).
+
+    Note that this changes the working directory of the test process and
+    does not restore it afterwards.
+    '''
+    # SimpleHTTPRequestHandler always uses CWD so we have to chdir
+    os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs'))
+
+    # port 0 asks the OS to pick any free port
+    httpd = http.server.HTTPServer(
+            ('localhost', 0), http.server.SimpleHTTPRequestHandler)
+    httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
+    httpd_thread.start()
+
+    def fin():
+        # stop serve_forever(), close the listening socket, then wait for
+        # the server thread to finish
+        httpd.shutdown()
+        httpd.server_close()
+        httpd_thread.join()
+    request.addfinalizer(fin)
+
+    return httpd
+
+def test_httpd(httpd):
+    '''
+    Tests that our http server is working as expected, and that two fetches
+    of the same url return the same payload, proving it can be used to test
+    deduplication.
+    '''
+    url = 'http://localhost:%s/' % httpd.server_port
+
+    payload1 = payload2 = None
+    with urllib.request.urlopen(url) as response:
+        assert response.status == 200
+        payload1 = response.read()
+        # an empty payload would make the equality check below meaningless
+        assert payload1
+
+    with urllib.request.urlopen(url) as response:
+        assert response.status == 200
+        payload2 = response.read()
+        assert payload2
+
+    assert payload1 == payload2
+
+def test_services_up():
+    '''Check that the expected services are up and running.'''
+    # check that warcprox is listening
+    with socket.socket() as s:
+        # if the connect fails an exception is raised and the test fails
+        s.connect(('localhost', 8000))
+
+    # NOTE: the pywb check below is disabled (commented out)
+    ### # check that pywb is listening
+    ### with socket.socket() as s:
+    ###     # if the connect fails an exception is raised and the test fails
+    ###     s.connect(('localhost', 8880))
+
+    # check that rethinkdb is listening and looks sane
+    r = rethinkstuff.Rethinker(db='rethinkdb')  # built-in db
+    tbls = r.table_list().run()
+    # presumably the built-in db always contains more than 10 system tables,
+    # so this is a cheap sanity check on the response -- TODO confirm
+    assert len(tbls) > 10
+
+def test_brozzle_site(httpd):
+    # placeholder, not implemented yet; currently only triggers setup of
+    # the httpd fixture
+    pass
+
--- a/vagrant/ansible/hosts
+++ b/vagrant/ansible/hosts
@ -13,4 +13,4 @@ ansible_ssh_private_key_file=.vagrant/machines/10.9.9.9/virtualbox/private_key
 10.9.9.9

 [pywb]
-10.9.9.9
+10.9.9.9
--- a/vagrant/ansible/playbook.yml
+++ b/vagrant/ansible/playbook.yml
@ -24,7 +24,7 @@
  roles:
  - brozzler-webconsole

-# - name: deploy pywb
-#   hosts: pywb
-#   roles:
-#     - pywb
+- name: deploy pywb
+  hosts: pywb
+  roles:
+    - pywb
--- a/vagrant/ansible/roles/brozzler-webconsole/tasks/main.yml
+++ b/vagrant/ansible/roles/brozzler-webconsole/tasks/main.yml
@ -1,6 +1,5 @@
 ---
 - name: install brozzler[webconsole] in virtualenv
-  become: true
  pip: name='-e /brozzler[webconsole]'
       virtualenv=/home/vagrant/brozzler-webconsole-ve34
       virtualenv_python=python3.4
@ -12,4 +11,4 @@
  template: src=templates/brozzler-webconsole.conf.j2
            dest=/etc/init/brozzler-webconsole.conf
  notify:
-  - restart brozzler-webconsole
+  - restart brozzler-webconsole
--- a/vagrant/ansible/roles/brozzler-worker/tasks/main.yml
+++ b/vagrant/ansible/roles/brozzler-worker/tasks/main.yml
@ -26,7 +26,6 @@
    - ttf-indic-fonts
    - fonts-thai-tlwg
    - fonts-lklug-sinhala
-    - python3-pip
    - git
    - libjpeg-turbo8-dev
    - zlib1g-dev
@ -49,7 +48,6 @@
  notify:
    - restart vnc-websock
 - name: install brozzler in virtualenv
-  become: true
  pip: # name=git+https://github.com/internetarchive/brozzler.git#egg=brozzler
       name='-e /brozzler'
       virtualenv=/home/vagrant/brozzler-ve34
--- a/vagrant/ansible/roles/common/tasks/main.yml
+++ b/vagrant/ansible/roles/common/tasks/main.yml
@ -1,4 +1,24 @@
 ---
- name: ensure logs directory exists
+## # get latest pip (had problems with version from apt-get, specifically
+## # "pip install pyopenssl" did not install the dependency "cryptography")
+## # http://stackoverflow.com/questions/34587473/what-is-get-pip-py-checksum-where-can-i-get-it-for-sure
+## - name: install setuptools for python 2 and 3
+##   become: true
+##   apt: name={{item}} state=present
+##   with_items:
+##   - python-setuptools
+##   - python3-setuptools
+## - name: download pip-8.1.2.tar.gz
+##   get_url:
+##     url: https://pypi.python.org/packages/e7/a8/7556133689add8d1a54c0b14aeff0acb03c64707ce100ecd53934da1aa13/pip-8.1.2.tar.gz
+##     dest: /tmp
+##     checksum: sha1:1c13c247967ec5bee6de5fd104c5d78ba30951c7
+## - name: extract pip-8.1.2.tar.gz
+##   unarchive: src=/tmp/pip-8.1.2.tar.gz dest=/tmp copy=no
+## - name: run "python3 setup.py install" in /tmp/pip-8.1.2
+##   command: python3 setup.py install chdir=/tmp/pip-8.1.2
+##            creates=/usr/local/lib/python2.7/dist-packages/pip-8.1.2-py2.7.egg/pip/__init__.py
+##   become: true
+- name: mkdir /vagrant/logs
  file: path=/vagrant/logs state=directory
  become: true
--- a/vagrant/ansible/roles/pywb/handlers/main.yml
+++ b/vagrant/ansible/roles/pywb/handlers/main.yml
@ -0,0 +1,5 @@
+---
+- name: restart pywb
+  service: name=pywb state=restarted
+  become: true
+
--- a/vagrant/ansible/roles/pywb/tasks/main.yml
+++ b/vagrant/ansible/roles/pywb/tasks/main.yml
@ -0,0 +1,27 @@
+---
+- name: install pywb in virtualenv
+  pip: name=pywb
+       virtualenv=/home/vagrant/pywb-ve34
+       virtualenv_python=python3.4
+       extra_args='--no-input --upgrade --pre'
+  notify:
+  - restart pywb
+- name: install brozzler in pywb virtualenv
+  pip: name='-e /brozzler'
+       virtualenv=/home/vagrant/pywb-ve34
+       virtualenv_python=python3.4
+       extra_args='--no-input --upgrade --pre'
+  notify:
+  - restart pywb
+- name: pywb config file /etc/pywb.yml
+  template: src=templates/pywb.yml.j2
+            dest=/etc/pywb.yml
+  become: true
+  notify:
+  - restart pywb
+- name: upstart config file /etc/init/pywb.conf
+  template: src=templates/pywb.conf.j2
+            dest=/etc/init/pywb.conf
+  become: true
+  notify:
+  - restart pywb
--- a/vagrant/ansible/roles/pywb/templates/pywb.conf.j2
+++ b/vagrant/ansible/roles/pywb/templates/pywb.conf.j2
@ -0,0 +1,14 @@
+description "pywb"
+
+start on runlevel [2345]
+stop on runlevel [!2345]
+
+env PYTHONPATH=/home/vagrant/pywb-ve34/lib/python3.4/site-packages
+env PATH=/home/vagrant/pywb-ve34/bin:/usr/bin:/bin
+env PYWB_CONFIG_FILE=/etc/pywb.yml
+
+setuid vagrant
+
+# console log
+
+exec nice brozzler-wayback >>/vagrant/logs/pywb.log 2>&1
--- a/vagrant/ansible/roles/pywb/templates/pywb.yml.j2
+++ b/vagrant/ansible/roles/pywb/templates/pywb.yml.j2
@ -0,0 +1,12 @@
+archive_paths: /vagrant/warcs/
+collections:
+  brozzler:
+    index_paths: !!python/object:brozzler.pywb.RethinkCDXSource
+      db: brozzler
+      servers: [localhost]
+      table: captures
+enable_auto_colls: false
+enable_cdx_api: true
+framed_replay: true
+port: 8880
+
--- a/vagrant/ansible/roles/rethinkdb/tasks/main.yml
+++ b/vagrant/ansible/roles/rethinkdb/tasks/main.yml
@ -10,12 +10,14 @@
  apt: name=rethinkdb state=present
  become: true
  notify:
-    - restart rethinkdb
+  - restart rethinkdb
+# XXX rethinkdb fails to start in spite of this, I think because /vagrant
+# gets mounted too late, and it tries to log there
 - name: ensure rethinkdb starts on reboot
  service: name=rethinkdb enabled=yes
 - name: ensure rethinkdb instance config file is installed
-  template: src=templates/rethinkdb-brozzler-easy.conf.j2
-            dest=/etc/rethinkdb/instances.d/rethinkdb-brozzler-easy.conf
+  template: src=templates/rethinkdb-brozzler-vagrant-1.conf.j2
+            dest=/etc/rethinkdb/instances.d/rethinkdb-brozzler-vagrant-1.conf
  become: true
  notify:
-    - restart rethinkdb
+  - restart rethinkdb
--- a/vagrant/ansible/roles/rethinkdb/templates/rethinkdb-brozzler-vagrant-1.conf.j2
+++ b/vagrant/ansible/roles/rethinkdb/templates/rethinkdb-brozzler-vagrant-1.conf.j2
--- a/vagrant/ansible/roles/warcprox/tasks/main.yml
+++ b/vagrant/ansible/roles/warcprox/tasks/main.yml
@ -3,23 +3,23 @@
  become: true
  apt: name={{item}} state=present
  with_items:
-    - gcc
-    - python-virtualenv
-    - python3.4
-    - libpython3.4-dev
-    - libffi-dev
-    - libssl-dev
-    - tor
-    - git
+  - gcc
+  - python-virtualenv
+  - python3.4
+  - libpython3.4-dev
+  - libffi-dev
+  - libssl-dev
+  - tor
+  - git
 - name: install warcprox in virtualenv
  pip: name=git+https://github.com/internetarchive/warcprox.git@2.x#egg=warcprox
       virtualenv=/home/vagrant/warcprox-ve34
       virtualenv_python=python3.4
       extra_args='--no-input --upgrade --pre'
  notify:
-    - restart warcprox
+  - restart warcprox
 - name: install upstart config /etc/init/warcprox.conf
  become: true
  template: src=templates/warcprox.conf.j2 dest=/etc/init/warcprox.conf
  notify:
-    - restart warcprox
+  - restart warcprox
--- a/vagrant/run-tests.sh
+++ b/vagrant/run-tests.sh
@ -0,0 +1,12 @@
+#!/bin/bash
+#
+# run-tests.sh - prints the status of some of the services in the vagrant
+# vm, then runs the tests in /brozzler/tests inside the vm, using the
+# brozzler-ve34 virtualenv
+#
+
+# informational only, a stopped service does not abort the test run
+echo service status:
+vagrant ssh -- 'status warcprox ;
+                status Xvnc ;
+                status brozzler-worker ;
+                status brozzler-webconsole ;
+                status vnc-websock'
+echo
+
+# pytest has to be installed into the virtualenv before it can be run there
+vagrant ssh -- 'source brozzler-ve34/bin/activate && pip install pytest'
+vagrant ssh -- 'source brozzler-ve34/bin/activate && py.test -v -s /brozzler/tests'
--- a/vagrant/vagrant-brozzler-new-job.py
+++ b/vagrant/vagrant-brozzler-new-job.py
@ -0,0 +1,42 @@
+#!/usr/bin/env python
+'''
+vagrant-brozzler-new-job.py - runs brozzler-new-job inside the vagrant vm to
+queue a job for your vagrant brozzler deployment.
+
+This is a standalone script with no dependencies other than python, and should
+work with python 2.7 or python 3.2+. The only reason it's not a bash script is
+so we can use the argparse library.
+'''
+
+import sys
+import os
+import argparse
+import subprocess
+
+def main(argv=None):
+    '''
+    Copies a brozzler job configuration file into a temp file inside the
+    vagrant vm, by piping it to "cat" over "vagrant ssh".
+
+    `argv` is the full argument vector including the program name at
+    `argv[0]`; defaults to `sys.argv`.
+    '''
+    argv = argv or sys.argv
+    arg_parser = argparse.ArgumentParser(prog=os.path.basename(argv[0]))
+    arg_parser.add_argument(
+            'job_conf_file', metavar='JOB_CONF_FILE',
+            help='brozzler job configuration file in yaml')
+    args = arg_parser.parse_args(args=argv[1:])
+
+    # resolve this before any chdir; abspath also avoids os.chdir('') when
+    # the script is run from its own directory
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+
+    # open before chdir so that a relative JOB_CONF_FILE is resolved against
+    # the directory the user ran the script from
+    with open(args.job_conf_file, 'rb') as f:
+        # cd to path with Vagrantfile so "vagrant ssh" knows what to do
+        os.chdir(script_dir)
+        # stdin has to be a file object or descriptor, bytes do not work
+        subprocess.call(
+                ['vagrant', 'ssh', '--', 'f=`mktemp` && cat > $f'],
+                stdin=f)
+
+    # TODO the uploaded temp file is not handed to brozzler-new-job yet
+
+if __name__ == '__main__':
+    main(sys.argv)
+
+## # cd to path with Vagrantfile so "vagrant ssh" knows what to do
+## script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+## cd $script_dir
+## 
+## vagrant ssh -- \
+##         PYTHONPATH=/home/vagrant/brozzler-ve34/lib/python3.4/site-packages \
+##         /home/vagrant/brozzler-ve34/bin/python \
+##         /home/vagrant/brozzler-ve34/bin/brozzler-new-job "$@"
--- a/vagrant/vagrant-brozzler-new-site.py
+++ b/vagrant/vagrant-brozzler-new-site.py
@ -0,0 +1,86 @@
+#!/usr/bin/env python
+'''
+vagrant-brozzler-new-site.py - runs brozzler-new-site inside the vagrant vm to
+queue a site for your vagrant brozzler deployment.
+
+Fills in the --proxy option automatically. Some other options are passed
+through.
+
+This is a standalone script with no dependencies other than python, and should
+work with python 2.7 or python 3.2+. The only reason it's not a bash script is
+so we can use the argparse library.
+
+Copyright (C) 2014-2016 Internet Archive
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+import sys
+import os
+import argparse
+import subprocess
+try:
+    from shlex import quote
+except:
+    from pipes import quote
+
+def main(argv=None):
+    '''
+    Builds a brozzler-new-site command line from the arguments and runs it
+    inside the vagrant vm using "vagrant ssh".
+
+    --proxy=localhost:8000 and --enable-warcprox-features are always passed;
+    --time-limit, --ignore-robots, --warcprox-meta, --quiet and --verbose
+    are passed through when given.
+
+    `argv` is the full argument vector including the program name at
+    `argv[0]`; defaults to `sys.argv`.
+    '''
+    argv = argv or sys.argv
+    arg_parser = argparse.ArgumentParser(prog=os.path.basename(argv[0]))
+    arg_parser.add_argument('seed', metavar='SEED', help='seed url')
+    arg_parser.add_argument(
+            '--time-limit', dest='time_limit', default=None,
+            help='time limit in seconds for this site')
+    arg_parser.add_argument(
+            '--ignore-robots', dest='ignore_robots', action='store_true',
+            help='ignore robots.txt for this site')
+    arg_parser.add_argument(
+            '--warcprox-meta', dest='warcprox_meta',
+            help=(
+                'Warcprox-Meta http request header to send with each request; '
+                'must be a json blob, ignored unless warcprox features are '
+                'enabled'))
+    arg_parser.add_argument(
+            '-q', '--quiet', dest='quiet', action='store_true')
+    arg_parser.add_argument(
+            '-v', '--verbose', dest='verbose', action='store_true')
+
+    args = arg_parser.parse_args(args=argv[1:])
+
+    # The command is handed to ssh as a single string and parsed by a shell
+    # inside the vm, so every user-supplied value has to be shell-quoted
+    # (seed urls routinely contain "&", "?" and other shell metacharacters).
+    options = []
+    if args.time_limit:
+        options.append('--time-limit=%s' % quote(args.time_limit))
+    if args.ignore_robots:
+        options.append('--ignore-robots')
+    if args.warcprox_meta:
+        options.append(
+                '--warcprox-meta=%s' % quote(args.warcprox_meta))
+    if args.quiet:
+        options.append('--quiet')
+    if args.verbose:
+        options.append('--verbose')
+
+    # cd to path with Vagrantfile so "vagrant ssh" knows what to do; abspath
+    # avoids os.chdir('') when the script is run from its own directory
+    os.chdir(os.path.dirname(os.path.abspath(__file__)))
+
+    cmd = (
+        'PYTHONPATH=/home/vagrant/brozzler-ve34/lib/python3.4/site-packages '
+        '/home/vagrant/brozzler-ve34/bin/python '
+        '/home/vagrant/brozzler-ve34/bin/brozzler-new-site '
+        '--proxy=localhost:8000 --enable-warcprox-features %s %s') % (
+                ' '.join(options), quote(args.seed))
+    subprocess.call(['vagrant', 'ssh', '--', cmd])
+
+if __name__ == '__main__':
+    main(sys.argv)
+