From 85073ab82bd0909914fbaa2959a73e9acf14c620 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 14 Sep 2016 17:04:01 -0700 Subject: [PATCH 01/10] new prog "brozzler-wayback" runs monkey-patched pywb --- brozzler/pywb.py | 16 ++++++++++++++-- setup.py | 3 ++- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/brozzler/pywb.py b/brozzler/pywb.py index 8f1ece8..dc9072a 100644 --- a/brozzler/pywb.py +++ b/brozzler/pywb.py @@ -1,6 +1,7 @@ -#!/usr/bin/env python ''' -brozzler/pywb.py - pywb support for rethinkdb index +brozzler/pywb.py - pywb customizations for brozzler including rethinkdb index, +loading from warcs still being written to, and canonicalization rules matching +brozzler conventions Copyright (C) 2016 Internet Archive @@ -35,6 +36,7 @@ import rethinkstuff import rethinkdb import surt import json +import brozzler class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource): def __init__(self, servers, db, table): @@ -192,3 +194,13 @@ def support_in_progress_warcs(): results.append('%s.open' % warc_path) return results pywb.warc.pathresolvers.PrefixResolver.__call__ = _prefix_resolver_call + +def main(argv=sys.argv): + brozzler.pywb.TheGoodUrlCanonicalizer.replace_default_canonicalizer() + brozzler.pywb.TheGoodUrlCanonicalizer.monkey_patch_dsrules_init() + brozzler.pywb.support_in_progress_warcs() + wayback_cli = pywb.apps.cli.WaybackCli( + args=argv[1:], default_port=8880, + desc=('brozzler-wayback - pywb wayback (monkey-patched for use ' + 'with brozzler)')) + wayback_cli.run() diff --git a/setup.py b/setup.py index 036d336..efd64eb 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b6.dev78', + version='1.1b6.dev79', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', @@ -53,6 +53,7 @@ setuptools.setup( 'brozzler-ensure-tables=brozzler.cli:brozzler_ensure_tables', 
'brozzler-webconsole=brozzler.webconsole:main', 'brozzler-easy=brozzler.easy:main', + 'brozzler-wayback=brozzler.pywb:main', ], }, install_requires=[ From be27b4e16e9c94aa026358554edd8a24edf9385f Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 14 Sep 2016 17:04:32 -0700 Subject: [PATCH 02/10] header comment tweak --- brozzler/job.py | 36 ++++++++++++++++++------------------ setup.py | 2 +- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/brozzler/job.py b/brozzler/job.py index bfaef4d..ba259ec 100644 --- a/brozzler/job.py +++ b/brozzler/job.py @@ -1,21 +1,21 @@ -# -# brozzler/job.py - Job class representing a brozzler crawl job, and functions -# for setting up a job with supplied configuration -# -# Copyright (C) 2014-2016 Internet Archive -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# +''' +brozzler/job.py - Job class representing a brozzler crawl job, and functions +for setting up a job with supplied configuration + +Copyright (C) 2014-2016 Internet Archive + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +''' import logging import brozzler diff --git a/setup.py b/setup.py index efd64eb..b900d15 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b6.dev79', + version='1.1b6.dev80', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', From c864499a64391c50080c5f5a49fe2a9f508984ad Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 14 Sep 2016 17:06:49 -0700 Subject: [PATCH 03/10] starting to create a framework for testing --- setup.py | 2 +- tests/htdocs/file1.txt | 1 + tests/test_cluster.py | 88 ++++++++++++++++++++++++++++++++++++++++++ vagrant/run-tests.sh | 12 ++++++ 4 files changed, 102 insertions(+), 1 deletion(-) create mode 100644 tests/htdocs/file1.txt create mode 100644 tests/test_cluster.py create mode 100755 vagrant/run-tests.sh diff --git a/setup.py b/setup.py index b900d15..083a52b 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b6.dev80', + version='1.1b6.dev81', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', diff --git a/tests/htdocs/file1.txt b/tests/htdocs/file1.txt new file mode 100644 index 0000000..d4a2f1c --- /dev/null +++ b/tests/htdocs/file1.txt @@ -0,0 +1 @@ +I'm a plain text file. 
diff --git a/tests/test_cluster.py b/tests/test_cluster.py new file mode 100644 index 0000000..7f8033d --- /dev/null +++ b/tests/test_cluster.py @@ -0,0 +1,88 @@ +#!/usr/bin/env python +''' +cluster-integration-tests.py - integration tests for a brozzler cluster, +expects brozzler, warcprox, pywb, rethinkdb and other dependencies to be +running already + +Copyright (C) 2016 Internet Archive + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +''' + +import pytest +import http.server +import threading +import urllib.request +import os +import socket +import rethinkstuff + +@pytest.fixture(scope='module') +def httpd(request): + # SimpleHTTPRequestHandler always uses CWD so we have to chdir + os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs')) + + httpd = http.server.HTTPServer( + ('localhost', 0), http.server.SimpleHTTPRequestHandler) + httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever) + httpd_thread.start() + + def fin(): + httpd.shutdown() + httpd.server_close() + httpd_thread.join() + request.addfinalizer(fin) + + return httpd + +def test_httpd(httpd): + ''' + Tests that our http server is working as expected, and that two fetches + of the same url return the same payload, proving it can be used to test + deduplication. 
+ ''' + payload1 = content2 = None + with urllib.request.urlopen( + 'http://localhost:%s/' % httpd.server_port) as response: + assert response.status == 200 + payload1 = response.read() + assert payload1 + + with urllib.request.urlopen( + 'http://localhost:%s/' % httpd.server_port) as response: + assert response.status == 200 + payload2 = response.read() + assert payload2 + + assert payload1 == payload2 + +def test_services_up(): + '''Check that the expected services are up and running.''' + # check that warcprox is listening + with socket.socket() as s: + # if the connect fails an exception is raised and the test fails + s.connect(('localhost', 8000)) + + ### # check that pywb is listening + ### with socket.socket() as s: + ### # if the connect fails an exception is raised and the test fails + ### s.connect(('localhost', 8880)) + + # check that rethinkdb is listening and looks sane + r = rethinkstuff.Rethinker(db='rethinkdb') # built-in db + tbls = r.table_list().run() + assert len(tbls) > 10 + +def test_brozzle_site(httpd): + pass + diff --git a/vagrant/run-tests.sh b/vagrant/run-tests.sh new file mode 100755 index 0000000..42cd6f9 --- /dev/null +++ b/vagrant/run-tests.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +echo service status: +vagrant ssh -- 'status warcprox ; + status Xvnc ; + status brozzler-worker ; + status brozzler-webconsole ; + status vnc-websock' +echo + +vagrant ssh -- 'source brozzler-ve34/bin/activate && pip install pytest' +vagrant ssh -- 'source brozzler-ve34/bin/activate && py.test -v -s /brozzler/tests' From 38af0f347b8cf6fe91af50eaf8c2c3afab871ce6 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 14 Sep 2016 17:08:00 -0700 Subject: [PATCH 04/10] working on including pywb in vagrant environment (not finished) --- setup.py | 2 +- vagrant/ansible/hosts | 2 +- vagrant/ansible/playbook.yml | 8 +++--- .../roles/brozzler-webconsole/tasks/main.yml | 3 +-- .../roles/brozzler-worker/tasks/main.yml | 2 -- vagrant/ansible/roles/common/tasks/main.yml | 22 
++++++++++++++- vagrant/ansible/roles/pywb/handlers/main.yml | 5 ++++ vagrant/ansible/roles/pywb/tasks/main.yml | 27 +++++++++++++++++++ .../ansible/roles/pywb/templates/pywb.conf.j2 | 14 ++++++++++ .../ansible/roles/pywb/templates/pywb.yml.j2 | 12 +++++++++ .../ansible/roles/rethinkdb/tasks/main.yml | 10 ++++--- .../templates/rethinkdb-brozzler-easy.conf.j2 | 5 ---- vagrant/ansible/roles/warcprox/tasks/main.yml | 20 +++++++------- 13 files changed, 102 insertions(+), 30 deletions(-) create mode 100644 vagrant/ansible/roles/pywb/handlers/main.yml create mode 100644 vagrant/ansible/roles/pywb/tasks/main.yml create mode 100644 vagrant/ansible/roles/pywb/templates/pywb.conf.j2 create mode 100644 vagrant/ansible/roles/pywb/templates/pywb.yml.j2 delete mode 100644 vagrant/ansible/roles/rethinkdb/templates/rethinkdb-brozzler-easy.conf.j2 diff --git a/setup.py b/setup.py index 083a52b..d37f14a 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b6.dev81', + version='1.1b6.dev82', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', diff --git a/vagrant/ansible/hosts b/vagrant/ansible/hosts index 708a07a..0257aeb 100644 --- a/vagrant/ansible/hosts +++ b/vagrant/ansible/hosts @@ -13,4 +13,4 @@ ansible_ssh_private_key_file=.vagrant/machines/10.9.9.9/virtualbox/private_key 10.9.9.9 [pywb] -10.9.9.9 \ No newline at end of file +10.9.9.9 diff --git a/vagrant/ansible/playbook.yml b/vagrant/ansible/playbook.yml index fda30b8..f55a4b1 100644 --- a/vagrant/ansible/playbook.yml +++ b/vagrant/ansible/playbook.yml @@ -24,7 +24,7 @@ roles: - brozzler-webconsole -# - name: deploy pywb -# hosts: pywb -# roles: -# - pywb +- name: deploy pywb + hosts: pywb + roles: + - pywb diff --git a/vagrant/ansible/roles/brozzler-webconsole/tasks/main.yml b/vagrant/ansible/roles/brozzler-webconsole/tasks/main.yml index f0f70d8..1d7194a 100644 --- 
a/vagrant/ansible/roles/brozzler-webconsole/tasks/main.yml +++ b/vagrant/ansible/roles/brozzler-webconsole/tasks/main.yml @@ -1,6 +1,5 @@ --- - name: install brozzler[webconsole] in virtualenv - become: true pip: name='-e /brozzler[webconsole]' virtualenv=/home/vagrant/brozzler-webconsole-ve34 virtualenv_python=python3.4 @@ -12,4 +11,4 @@ template: src=templates/brozzler-webconsole.conf.j2 dest=/etc/init/brozzler-webconsole.conf notify: - - restart brozzler-webconsole \ No newline at end of file + - restart brozzler-webconsole diff --git a/vagrant/ansible/roles/brozzler-worker/tasks/main.yml b/vagrant/ansible/roles/brozzler-worker/tasks/main.yml index a4ec194..7dad56a 100644 --- a/vagrant/ansible/roles/brozzler-worker/tasks/main.yml +++ b/vagrant/ansible/roles/brozzler-worker/tasks/main.yml @@ -26,7 +26,6 @@ - ttf-indic-fonts - fonts-thai-tlwg - fonts-lklug-sinhala - - python3-pip - git - libjpeg-turbo8-dev - zlib1g-dev @@ -49,7 +48,6 @@ notify: - restart vnc-websock - name: install brozzler in virtualenv - become: true pip: # name=git+https://github.com/internetarchive/brozzler.git#egg=brozzler name='-e /brozzler' virtualenv=/home/vagrant/brozzler-ve34 diff --git a/vagrant/ansible/roles/common/tasks/main.yml b/vagrant/ansible/roles/common/tasks/main.yml index f9012ca..9a14357 100644 --- a/vagrant/ansible/roles/common/tasks/main.yml +++ b/vagrant/ansible/roles/common/tasks/main.yml @@ -1,4 +1,24 @@ --- -- name: ensure logs directory exists +## # get latest pip (had problems with version from apt-get, specifically +## # "pip install pyopenssl" did not install the dependency "cryptography") +## # http://stackoverflow.com/questions/34587473/what-is-get-pip-py-checksum-where-can-i-get-it-for-sure +## - name: install setuptools for python 2 and 3 +## become: true +## apt: name={{item}} state=present +## with_items: +## - python-setuptools +## - python3-setuptools +## - name: download pip-8.1.2.tar.gz +## get_url: +## url: 
https://pypi.python.org/packages/e7/a8/7556133689add8d1a54c0b14aeff0acb03c64707ce100ecd53934da1aa13/pip-8.1.2.tar.gz +## dest: /tmp +## checksum: sha1:1c13c247967ec5bee6de5fd104c5d78ba30951c7 +## - name: extract pip-8.1.2.tar.gz +## unarchive: src=/tmp/pip-8.1.2.tar.gz dest=/tmp copy=no +## - name: run "python3 setup.py install" in /tmp/pip-8.1.2 +## command: python3 setup.py install chdir=/tmp/pip-8.1.2 +## creates=/usr/local/lib/python2.7/dist-packages/pip-8.1.2-py2.7.egg/pip/__init__.py +## become: true +- name: mkdir /vagrant/logs file: path=/vagrant/logs state=directory become: true diff --git a/vagrant/ansible/roles/pywb/handlers/main.yml b/vagrant/ansible/roles/pywb/handlers/main.yml new file mode 100644 index 0000000..4424b3e --- /dev/null +++ b/vagrant/ansible/roles/pywb/handlers/main.yml @@ -0,0 +1,5 @@ +--- +- name: restart pywb + service: name=pywb state=restarted + become: true + diff --git a/vagrant/ansible/roles/pywb/tasks/main.yml b/vagrant/ansible/roles/pywb/tasks/main.yml new file mode 100644 index 0000000..08fac19 --- /dev/null +++ b/vagrant/ansible/roles/pywb/tasks/main.yml @@ -0,0 +1,27 @@ +--- +- name: install pywb in virtualenv + pip: name=pywb + virtualenv=/home/vagrant/pywb-ve34 + virtualenv_python=python3.4 + extra_args='--no-input --upgrade --pre' + notify: + - restart pywb +- name: install brozzler in pywb virtualenv + pip: name='-e /brozzler' + virtualenv=/home/vagrant/pywb-ve34 + virtualenv_python=python3.4 + extra_args='--no-input --upgrade --pre' + notify: + - restart pywb +- name: pywb config file /etc/pywb.yml + template: src=templates/pywb.yml.j2 + dest=/etc/pywb.yml + become: true + notify: + - restart pywb +- name: upstart config file /etc/init/pywb.conf + template: src=templates/pywb.conf.j2 + dest=/etc/init/pywb.conf + become: true + notify: + - restart pywb diff --git a/vagrant/ansible/roles/pywb/templates/pywb.conf.j2 b/vagrant/ansible/roles/pywb/templates/pywb.conf.j2 new file mode 100644 index 0000000..5b2887e --- 
/dev/null +++ b/vagrant/ansible/roles/pywb/templates/pywb.conf.j2 @@ -0,0 +1,14 @@ +description "pywb" + +start on runlevel [2345] +stop on runlevel [!2345] + +env PYTHONPATH=/home/vagrant/pywb-ve34/lib/python3.4/site-packages +env PATH=/home/vagrant/pywb-ve34/bin:/usr/bin:/bin +env PYWB_CONFIG_FILE=/etc/pywb.yml + +setuid vagrant + +# console log + +exec nice brozzler-wayback >>/vagrant/logs/pywb.log 2>&1 diff --git a/vagrant/ansible/roles/pywb/templates/pywb.yml.j2 b/vagrant/ansible/roles/pywb/templates/pywb.yml.j2 new file mode 100644 index 0000000..c17281c --- /dev/null +++ b/vagrant/ansible/roles/pywb/templates/pywb.yml.j2 @@ -0,0 +1,12 @@ +archive_paths: /vagrant/warcs/ +collections: + brozzler: + index_paths: !!python/object:brozzler.pywb.RethinkCDXSource + db: brozzler + servers: [localhost] + table: captures +enable_auto_colls: false +enable_cdx_api: true +framed_replay: true +port: 8880 + diff --git a/vagrant/ansible/roles/rethinkdb/tasks/main.yml b/vagrant/ansible/roles/rethinkdb/tasks/main.yml index 77bbb89..7f40ff4 100644 --- a/vagrant/ansible/roles/rethinkdb/tasks/main.yml +++ b/vagrant/ansible/roles/rethinkdb/tasks/main.yml @@ -10,12 +10,14 @@ apt: name=rethinkdb state=present become: true notify: - - restart rethinkdb + - restart rethinkdb +# XXX rethinkdb fails to start in spite of this, I think because /vagrant +# gets mounted too late, and it tries to log there - name: ensure rethinkdb starts on reboot service: name=rethinkdb enabled=yes - name: ensure rethinkdb instance config file is installed - template: src=templates/rethinkdb-brozzler-easy.conf.j2 - dest=/etc/rethinkdb/instances.d/rethinkdb-brozzler-easy.conf + template: src=templates/rethinkdb-brozzler-vagrant-1.conf.j2 + dest=/etc/rethinkdb/instances.d/rethinkdb-brozzler-vagrant-1.conf become: true notify: - - restart rethinkdb + - restart rethinkdb diff --git a/vagrant/ansible/roles/rethinkdb/templates/rethinkdb-brozzler-easy.conf.j2 
b/vagrant/ansible/roles/rethinkdb/templates/rethinkdb-brozzler-easy.conf.j2 deleted file mode 100644 index 62b3ac5..0000000 --- a/vagrant/ansible/roles/rethinkdb/templates/rethinkdb-brozzler-easy.conf.j2 +++ /dev/null @@ -1,5 +0,0 @@ -runuser=vagrant -bind=0.0.0.0 -# directory=/var/lib/rethinkdb -# log-file=/var/log/rethinkdb.log -log-file=/vagrant/logs/rethinkdb.log # synced dir diff --git a/vagrant/ansible/roles/warcprox/tasks/main.yml b/vagrant/ansible/roles/warcprox/tasks/main.yml index c9f611d..7a9c7d0 100644 --- a/vagrant/ansible/roles/warcprox/tasks/main.yml +++ b/vagrant/ansible/roles/warcprox/tasks/main.yml @@ -3,23 +3,23 @@ become: true apt: name={{item}} state=present with_items: - - gcc - - python-virtualenv - - python3.4 - - libpython3.4-dev - - libffi-dev - - libssl-dev - - tor - - git + - gcc + - python-virtualenv + - python3.4 + - libpython3.4-dev + - libffi-dev + - libssl-dev + - tor + - git - name: install warcprox in virtualenv pip: name=git+https://github.com/internetarchive/warcprox.git@2.x#egg=warcprox virtualenv=/home/vagrant/warcprox-ve34 virtualenv_python=python3.4 extra_args='--no-input --upgrade --pre' notify: - - restart warcprox + - restart warcprox - name: install upstart config /etc/init/warcprox.conf become: true template: src=templates/warcprox.conf.j2 dest=/etc/init/warcprox.conf notify: - - restart warcprox + - restart warcprox From 8f44eac2f3c1b519fd3d315e6d0778529d44c830 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Fri, 16 Sep 2016 15:42:19 -0700 Subject: [PATCH 05/10] better logs for facebook logins --- brozzler/behaviors.d/facebook.js.template | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/brozzler/behaviors.d/facebook.js.template b/brozzler/behaviors.d/facebook.js.template index 543bad7..3a551b3 100644 --- a/brozzler/behaviors.d/facebook.js.template +++ b/brozzler/behaviors.d/facebook.js.template @@ -198,10 +198,12 @@ var umbraBehaviorFinished = function() { return false; } + if 
(document.getElementById("login_form") == null || UMBRA_FB_USER_NAME.indexOf("parameter")>0 || UMBRA_FB_PASSWORD.indexOf("parameter")>0 ) {//check for unset parameters + console.log("missing login_form or login credentials; maybe already logged in?") var umbraIntervalId = setInterval(umbraIntervalFunc, 200); } -else //login +else {//login + console.log("#login_form and credentials found for " + location.href); umbraFacebookLogin(); - - +} From 253122d061286ec8950889023aa73b4d8c4ff284 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 16 Sep 2016 16:35:44 -0700 Subject: [PATCH 06/10] new script runs brozzler-new-site queues a new site to brozzle on the vagrant brozzler deployment --- setup.py | 2 +- vagrant/vagrant-brozzler-new-site.sh | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) create mode 100755 vagrant/vagrant-brozzler-new-site.sh diff --git a/setup.py b/setup.py index d37f14a..6725b3f 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b6.dev82', + version='1.1b6.dev83', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', diff --git a/vagrant/vagrant-brozzler-new-site.sh b/vagrant/vagrant-brozzler-new-site.sh new file mode 100755 index 0000000..bf45648 --- /dev/null +++ b/vagrant/vagrant-brozzler-new-site.sh @@ -0,0 +1,14 @@ +#!/bin/bash +# +# vagrant-brozzler-new-site.sh - run brozzler-new-site inside the vagrant vm to +# queue a job for your vagrant brozzler deployment +# + +# cd to path with Vagrantfile so "vagrant ssh" knows what to do +script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd $script_dir + +vagrant ssh -- \ + PYTHONPATH=/home/vagrant/brozzler-ve34/lib/python3.4/site-packages \ + /home/vagrant/brozzler-ve34/bin/python \ + /home/vagrant/brozzler-ve34/bin/brozzler-new-site "$@" From cc9517cb45e6fa83c8a30a5d47e47d73e69e7114 Mon Sep 17 00:00:00 2001 From: 
Noah Levitt Date: Thu, 22 Sep 2016 01:45:28 +0100 Subject: [PATCH 07/10] add missing rethinkdb config file to ansible config --- setup.py | 2 +- .../rethinkdb/templates/rethinkdb-brozzler-vagrant-1.conf.j2 | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) create mode 100644 vagrant/ansible/roles/rethinkdb/templates/rethinkdb-brozzler-vagrant-1.conf.j2 diff --git a/setup.py b/setup.py index 6725b3f..ff1df78 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b6.dev83', + version='1.1b6.dev84', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', diff --git a/vagrant/ansible/roles/rethinkdb/templates/rethinkdb-brozzler-vagrant-1.conf.j2 b/vagrant/ansible/roles/rethinkdb/templates/rethinkdb-brozzler-vagrant-1.conf.j2 new file mode 100644 index 0000000..62b3ac5 --- /dev/null +++ b/vagrant/ansible/roles/rethinkdb/templates/rethinkdb-brozzler-vagrant-1.conf.j2 @@ -0,0 +1,5 @@ +runuser=vagrant +bind=0.0.0.0 +# directory=/var/lib/rethinkdb +# log-file=/var/log/rethinkdb.log +log-file=/vagrant/logs/rethinkdb.log # synced dir From 2462efc4edf5639ecd107a6ce58e207d9ca8cf2c Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 22 Sep 2016 01:47:23 +0100 Subject: [PATCH 08/10] replace vagrant-brozzler-new-site with python script that fills in default options and passes through others --- setup.py | 2 +- vagrant/vagrant-brozzler-new-site.py | 86 ++++++++++++++++++++++++++++ vagrant/vagrant-brozzler-new-site.sh | 14 ----- 3 files changed, 87 insertions(+), 15 deletions(-) create mode 100755 vagrant/vagrant-brozzler-new-site.py delete mode 100755 vagrant/vagrant-brozzler-new-site.sh diff --git a/setup.py b/setup.py index ff1df78..de5c821 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b6.dev84', + version='1.1b6.dev85', 
description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', diff --git a/vagrant/vagrant-brozzler-new-site.py b/vagrant/vagrant-brozzler-new-site.py new file mode 100755 index 0000000..7e073aa --- /dev/null +++ b/vagrant/vagrant-brozzler-new-site.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python +''' +vagrant-brozzler-new-site.py - runs brozzler-new-site inside the vagrant vm to +queue a site for your vagrant brozzler deployment. + +Fills in the --proxy option automatically. some other options are passed +through. + +This is a standalone script with no dependencies other than python, and should +work with python 2.7 or python 3.2+. The only reason it's not a bash script is +so we can use the argparse library. + +Copyright (C) 2014-2016 Internet Archive + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+''' + +import sys +import os +import argparse +import subprocess +try: + from shlex import quote +except: + from pipes import quote + +def main(argv=[]): + arg_parser = argparse.ArgumentParser(prog=os.path.basename(argv[0])) + arg_parser.add_argument('seed', metavar='SEED', help='seed url') + arg_parser.add_argument( + '--time-limit', dest='time_limit', default=None, + help='time limit in seconds for this site') + arg_parser.add_argument( + '--ignore-robots', dest='ignore_robots', action='store_true', + help='ignore robots.txt for this site') + arg_parser.add_argument( + '--warcprox-meta', dest='warcprox_meta', + help=( + 'Warcprox-Meta http request header to send with each request; ' + 'must be a json blob, ignored unless warcprox features are ' + 'enabled')) + arg_parser.add_argument( + '-q', '--quiet', dest='quiet', action='store_true') + arg_parser.add_argument( + '-v', '--verbose', dest='verbose', action='store_true') + + args = arg_parser.parse_args(args=argv[1:]) + + options = [] + if args.time_limit: + options.append('--time-limit=%s' % args.time_limit) + if args.ignore_robots: + options.append('--ignore-robots') + if args.warcprox_meta: + # I think this shell escaping is correct? 
+ options.append( + '--warcprox-meta=%s' % quote(args.warcprox_meta)) + if args.quiet: + options.append('--quiet') + if args.verbose: + options.append('--verbose') + + # cd to path with Vagrantfile so "vagrant ssh" knows what to do + os.chdir(os.path.dirname(__file__)) + + cmd = ( + 'PYTHONPATH=/home/vagrant/brozzler-ve34/lib/python3.4/site-packages ' + '/home/vagrant/brozzler-ve34/bin/python ' + '/home/vagrant/brozzler-ve34/bin/brozzler-new-site ' + '--proxy=localhost:8000 --enable-warcprox-features %s %s') % ( + ' '.join(options), args.seed) + subprocess.call(['vagrant', 'ssh', '--', cmd]) + +if __name__ == '__main__': + main(sys.argv) + diff --git a/vagrant/vagrant-brozzler-new-site.sh b/vagrant/vagrant-brozzler-new-site.sh deleted file mode 100755 index bf45648..0000000 --- a/vagrant/vagrant-brozzler-new-site.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash -# -# vagrant-brozzler-new-site.sh - run brozzler-new-site inside the vagrant vm to -# queue a job for your vagrant brozzler deployment -# - -# cd to path with Vagrantfile so "vagrant ssh" knows what to do -script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" -cd $script_dir - -vagrant ssh -- \ - PYTHONPATH=/home/vagrant/brozzler-ve34/lib/python3.4/site-packages \ - /home/vagrant/brozzler-ve34/bin/python \ - /home/vagrant/brozzler-ve34/bin/brozzler-new-site "$@" From 8c9a9c566644de7cd09b54aa8fb1703d20116cd7 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 29 Sep 2016 12:03:16 -0700 Subject: [PATCH 09/10] starting on documenting job configuration --- README.rst | 3 +- brozzler/job.py | 2 +- .../webconsole/static/partials/workers.html | 12 ++- job-conf.rst | 81 +++++++++++++++++++ setup.py | 2 +- vagrant/vagrant-brozzler-new-job.py | 42 ++++++++++ vagrant/vagrant-brozzler-new-site.py | 2 +- 7 files changed, 138 insertions(+), 6 deletions(-) create mode 100644 job-conf.rst create mode 100755 vagrant/vagrant-brozzler-new-job.py diff --git a/README.rst b/README.rst index 2b47792..3e53867 100644 --- 
a/README.rst +++ b/README.rst @@ -95,7 +95,8 @@ Job Configuration Jobs are defined using yaml files. Options may be specified either at the top-level or on individual seeds. A job id and at least one seed url -must be specified, everything else is optional. +must be specified, everything else is optional. For details, see +`<job-conf.rst>`_. :: diff --git a/brozzler/job.py b/brozzler/job.py index ba259ec..85e955d 100644 --- a/brozzler/job.py +++ b/brozzler/job.py @@ -87,7 +87,7 @@ def new_site(frontier, site): frontier.new_page(page) logging.info("queued page %s", page) else: - logging.warn("seed url {} is blocked by robots.txt".format(site.seed)) + logging.warn("seed url %s is blocked by robots.txt", site.seed) finally: # finally block because we want to insert the Site no matter what frontier.new_site(site) diff --git a/brozzler/webconsole/static/partials/workers.html b/brozzler/webconsole/static/partials/workers.html index 61f9a61..5f39c77 100644 --- a/brozzler/webconsole/static/partials/workers.html +++ b/brozzler/webconsole/static/partials/workers.html @@ -12,11 +12,19 @@

Workers

+

This page depends on some deployment details outside of brozzler + itself, namely that port 8901 on each brozzler-worker is running + websockify bridging VNC running on the same host. The vagrant+ansible + configuration in the brozzler repo contains an example of that. + https://github.com/internetarchive/brozzler/tree/master/vagrant +

+
+
{{worker}}
{{worker.host}}
-
diff --git a/job-conf.rst b/job-conf.rst new file mode 100644 index 0000000..a073bed --- /dev/null +++ b/job-conf.rst @@ -0,0 +1,81 @@ +brozzler job configuration +========================== + +Jobs are defined using yaml files. Options may be specified either at the +top-level or on individual seeds. A job id and at least one seed url +must be specified, everything else is optional. + +an example +---------- + +:: + + id: myjob + time_limit: 60 # seconds + proxy: 127.0.0.1:8000 # point at warcprox for archiving + ignore_robots: false + enable_warcprox_features: false + warcprox_meta: + warc-prefix: job1 + stats: + buckets: + - job1-stats + metadata: {} + seeds: + - url: http://one.example.org/ + warcprox_meta: + warc-prefix: job1-seed1 + stats: + buckets: + - job1-seed1-stats + - url: http://two.example.org/ + time_limit: 30 + - url: http://three.example.org/ + time_limit: 10 + ignore_robots: true + scope: + surt: http://(org,example, + +how inheritance works +--------------------- + +Most of the available options apply to seeds. Such options can also be +specified at the top level, in which case the seeds inherit the options. If +an option is specified both at the top level and at the level of an individual +seed, the results are merged with the seed-level value taking precedence in +case of conflicts. It's probably easiest to make sense of this by way of an +example. + +In the example yaml above, ``warcprox_meta`` is specified at the top level and +at the seed level for the seed http://one.example.org/. 
At the top level we +have:: + + warcprox_meta: + warc-prefix: job1 + stats: + buckets: + - job1-stats + +At the seed level we have:: + + warcprox_meta: + warc-prefix: job1-seed1 + stats: + buckets: + - job1-seed1-stats + +The merged configuration as applied to the seed http://one.example.org/ will +be:: + + warcprox_meta: + warc-prefix: job1-seed1 + stats: + buckets: + - job1-stats + - job1-seed1-stats + +Notice that: + +- There is a collision on ``warc-prefix`` and the seed-level value wins. +- Since ``buckets`` is a list, the merged result includes all the values from + both the top level and the seed level. diff --git a/setup.py b/setup.py index de5c821..047a2e2 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b6.dev85', + version='1.1b6.dev86', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', diff --git a/vagrant/vagrant-brozzler-new-job.py b/vagrant/vagrant-brozzler-new-job.py new file mode 100755 index 0000000..767091b --- /dev/null +++ b/vagrant/vagrant-brozzler-new-job.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python +''' +vagrant-brozzler-new-job.py - runs brozzler-new-job inside the vagrant vm to +queue a job for your vagrant brozzler deployment. + +This is a standalone script with no dependencies other than python, and should +work with python 2.7 or python 3.2+. The only reason it's not a bash script is +so we can use the argparse library. 
+''' + +import sys +import os +import argparse +import subprocess + +def main(argv=[]): + arg_parser = argparse.ArgumentParser(prog=os.path.basename(argv[0])) + arg_parser.add_argument( + 'job_conf_file', metavar='JOB_CONF_FILE', + help='brozzler job configuration file in yaml') + args = arg_parser.parse_args(args=argv[1:]) + + with open(args.job_conf_file, 'rb') as f: + yaml_bytes = f.read() + subprocess.call( + ['vagrant', 'ssh', '--', 'f=`mktemp` && cat > $f'], + stdin=yaml_bytes) + + # cd to path with Vagrantfile so "vagrant ssh" knows what to do + os.chdir(os.path.dirname(__file__)) + +if __name__ == '__main__': + main(sys.argv) + +## # cd to path with Vagrantfile so "vagrant ssh" knows what to do +## script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +## cd $script_dir +## +## vagrant ssh -- \ +## PYTHONPATH=/home/vagrant/brozzler-ve34/lib/python3.4/site-packages \ +## /home/vagrant/brozzler-ve34/bin/python \ +## /home/vagrant/brozzler-ve34/bin/brozzler-new-job "$@" diff --git a/vagrant/vagrant-brozzler-new-site.py b/vagrant/vagrant-brozzler-new-site.py index 7e073aa..9986fef 100755 --- a/vagrant/vagrant-brozzler-new-site.py +++ b/vagrant/vagrant-brozzler-new-site.py @@ -3,7 +3,7 @@ vagrant-brozzler-new-site.py - runs brozzler-new-site inside the vagrant vm to queue a site for your vagrant brozzler deployment. -Fills in the --proxy option automatically. some other options are passed +Fills in the --proxy option automatically. Some other options are passed through. 
This is a standalone script with no dependencies other than python, and should From bfd4c1f8c6ebc77826e8af1e9db3f041c5b6445e Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 29 Sep 2016 16:15:44 -0700 Subject: [PATCH 10/10] document a bunch of job settings --- job-conf.rst | 130 +++++++++++++++++++++++++++++++++++++++++++++++++-- setup.py | 2 +- 2 files changed, 128 insertions(+), 4 deletions(-) diff --git a/job-conf.rst b/job-conf.rst index a073bed..056c7ca 100644 --- a/job-conf.rst +++ b/job-conf.rst @@ -1,12 +1,12 @@ brozzler job configuration -========================== +************************** Jobs are defined using yaml files. Options may be specified either at the top-level or on individual seeds. A job id and at least one seed url must be specified, everything else is optional. an example ----------- +========== :: @@ -37,7 +37,7 @@ an example surt: http://(org,example, how inheritance works ---------------------- +===================== Most of the available options apply to seeds. Such options can also be specified at the top level, in which case the seeds inherit the options. If @@ -79,3 +79,127 @@ Notice that: - There is a collision on ``warc-prefix`` and the seed-level value wins. - Since ``buckets`` is a list, the merged result includes all the values from both the top level and the seed level. + +settings reference +================== + +id +-- ++-----------+--------+----------+---------+ +| scope | type | required | default | ++===========+========+==========+=========+ +| top-level | string | yes? | *n/a* | ++-----------+--------+----------+---------+ +An arbitrary identifier for this job. Must be unique across this deployment of +brozzler. 
+ +seeds +----- ++-----------+------------------------+----------+---------+ +| scope | type | required | default | ++===========+========================+==========+=========+ +| top-level | list (of dictionaries) | yes | *n/a* | ++-----------+------------------------+----------+---------+ +List of seeds. Each item in the list is a dictionary (associative array) which +defines the seed. It must specify ``url`` (see below) and can additionally +specify any of the settings of scope *seed-level*. + +url +--- ++------------+--------+----------+---------+ +| scope | type | required | default | ++============+========+==========+=========+ +| seed-level | string | yes | *n/a* | ++------------+--------+----------+---------+ +The seed url. + +time_limit +---------- ++-----------------------+--------+----------+---------+ +| scope | type | required | default | ++=======================+========+==========+=========+ +| seed-level, top-level | number | no | *none* | ++-----------------------+--------+----------+---------+ +Time limit in seconds. If not specified, there is no time limit. Time limit is +enforced at the seed level. If a time limit is specified at the top level, it +is inherited by each seed as described above, and enforced individually on each +seed. + +proxy +----- ++-----------------------+--------+----------+---------+ +| scope | type | required | default | ++=======================+========+==========+=========+ +| seed-level, top-level | string | no | *none* | ++-----------------------+--------+----------+---------+ +HTTP proxy, with the format ``host:port``. Typically configured to point to +warcprox for archival crawling. 
+ +enable_warcprox_features +------------------------ ++-----------------------+---------+----------+---------+ +| scope | type | required | default | ++=======================+=========+==========+=========+ +| seed-level, top-level | boolean | no | false | ++-----------------------+---------+----------+---------+ +If true for a given seed, and the seed is configured to use a proxy, enables +special features that assume the proxy is an instance of warcprox. As of this +writing, the special features that are enabled are: + +- sending screenshots and thumbnails to warcprox using a WARCPROX_WRITE_RECORD + request +- sending youtube-dl metadata json to warcprox using a WARCPROX_WRITE_RECORD + request + +See the warcprox docs for information on the WARCPROX_WRITE_RECORD method (XXX +not yet written). + +*Note that if* ``warcprox_meta`` *and* ``proxy`` *are configured, the +Warcprox-Meta header will be sent even if* ``enable_warcprox_features`` *is not +set.* + +ignore_robots +------------- ++-----------------------+---------+----------+---------+ +| scope | type | required | default | ++=======================+=========+==========+=========+ +| seed-level, top-level | boolean | no | false | ++-----------------------+---------+----------+---------+ +If set to ``true``, brozzler will happily crawl pages that would otherwise be +blocked by robots.txt rules. + +warcprox_meta +------------- ++-----------------------+------------+----------+---------+ +| scope | type | required | default | ++=======================+============+==========+=========+ +| seed-level, top-level | dictionary | no | false | ++-----------------------+------------+----------+---------+ +Specifies the Warcprox-Meta header to send with every request, if ``proxy`` is +configured. The value of the Warcprox-Meta header is a json blob. It is used to +pass settings and information to warcprox. Warcprox does not forward the header +on to the remote site. 
See the warcprox docs for more information (XXX not yet +written). + +Brozzler takes the configured value of ``warcprox_meta``, converts it to +json and populates the Warcprox-Meta header with that value. For example:: + + warcprox_meta: + warc-prefix: job1-seed1 + stats: + buckets: + - job1-stats + - job1-seed1-stats + +becomes:: + + Warcprox-Meta: {"warc-prefix":"job1-seed1","stats":{"buckets":["job1-stats","job1-seed1-stats"]}} + +scope +----- ++-----------------------+------------+----------+---------+ +| scope | type | required | default | ++=======================+============+==========+=========+ +| seed-level, top-level | dictionary | no | false | ++-----------------------+------------+----------+---------+ +Scope rules. *TODO* diff --git a/setup.py b/setup.py index 047a2e2..912a829 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b6.dev86', + version='1.1b6.dev87', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt',