Merge branch 'typos' into qa

commit 4ada3e01b7
Barbara Miller, 2019-05-17 17:24:19 -07:00
40 changed files with 469 additions and 320 deletions

View file

@@ -11,19 +11,22 @@ before_install:
 - sudo pip install ansible==2.1.3.0
 install:
 - ansible-playbook --extra-vars="brozzler_pip_name=file://$TRAVIS_BUILD_DIR#egg=brozzler user=travis" --inventory-file=ansible/hosts-localhost ansible/playbook.yml
-- pip install $TRAVIS_BUILD_DIR git+https://github.com/internetarchive/warcprox.git#egg=warcprox pytest
+- pip install $TRAVIS_BUILD_DIR git+https://github.com/internetarchive/warcprox.git#egg=warcprox pytest==4.3.0
 - chromium-browser --version
 - sudo apt-get update
 - sudo apt-get install --only-upgrade chromium-browser
 - chromium-browser --version
-- sudo service brozzler-worker restart
+- ps ww -fHe
+- sudo cat /var/log/Xvnc.log
+- sudo cat /var/log/brozzler-worker.log
+- sudo cat /var/log/warcprox.log
 script:
 - DISPLAY=:1 py.test --tb=native -v tests
 after_failure:
 - chromium-browser --version
-- sudo cat /var/log/upstart/warcprox.log
-- sudo cat /var/log/upstart/brozzler-worker.log
-- sudo cat /var/log/upstart/pywb.log
+- sudo cat /var/log/warcprox.log
+- sudo cat /var/log/brozzler-worker.log
+- sudo cat /var/log/pywb.log
 notifications:
   slack:
     secure: KPPXSscXnmSEQ2NXBZFKrzDEYHg067Kv1WR7RTRUH8EIlSS9MHTyErRa7HkaRPmqOllj4vvPbplNU2ALnCfhP4cqW+MvF0xv3GuEGXQ7Om2sBvVUQ3w0JJ5rLq9ferAfGdSnQFeViqfDix5LA3fMNZGouUHQdUHq7iO8E9n9jntvkKO9Jff7Dyo0K5KvOZOJfM9KsqFZLlFO5zoNB6Y9jubIT7+Ulk3EDto/Kny34VPIyJIm7y0cHHlYLEq780AweY0EIwMyMg/VPSRrVAsbLSrilO0YRgsQpjPC9Ci/rAWNWooaOk0eA+bwv1uHQnGtH0z446XUMXr3UZ2QlD4DE/uoP2okkl8EtqvlmEyjV8eO86TqYFDRgKfYpvlK6hHtb7SAHX28QeXQjbKNc5f7KpKO5PtZqaoBRL7acLlKyS8xQGiRtonTPFSBTFR2A+s6dZmKO9dDboglptiHk4dvL1ZD4S8qLJn1JjTJqvIU6tpCY3BpNErn4n1MkDjN5nqdXf7Q9Vmui8vRetwnMf1oXcsKj9FEt2utNfDqFNXcFsN+Mnr9rhXQ1++gt/7Zo844OowiARcxqZTNy5LqSD01WgGCvNMy3Odf+FTQ8PcDOF+001+g8La1R99U0o9/hT/gy+WYk2prYneWru4pQHF/a6goZgkLTwkskcaPVpDJtDs=

View file

@@ -1,7 +1,9 @@
 [all:vars]
 warcs_dir=/vagrant/warcs
-brozzler_pip_name='-e /brozzler'
+# brozzler_pip_name='-e /brozzler' # not working anymore? :(
+brozzler_pip_name='/brozzler'
 user=vagrant
+ansible_python_interpreter=/usr/bin/python3
 ### possible values for a prod deployment
 # brozzler_pip_name=brozzler # get it from pypi
 # brozzler_pip_name=git+https://github.com/internetarchive/brozzler.git#egg=brozzler

View file

@@ -1,4 +1,8 @@
 ---
 - name: restart brozzler-dashboard
-  service: name=brozzler-dashboard state=restarted
+  svc:
+    name: brozzler-dashboard
+    state: restarted
+    service_dir: /etc/service
   become: true
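This is the pattern every service in this commit follows: the upstart-era service: one-liner becomes the daemontools svc module pointed at a run directory under /etc/service. Outside of Ansible, the resulting services can be exercised with daemontools' command-line tools, which is what the updated test fixtures later in this commit do; a minimal sketch along those lines (assuming daemontools is installed and the service directory exists):

import subprocess

def restart_service(service):
    # 'svc -d' takes a daemontools-supervised service down, 'svc -u' brings it up;
    # 'svstat' reports whether the supervised process is currently up or down
    subprocess.check_call(['sudo', 'svc', '-d', '/etc/service/' + service])
    subprocess.check_call(['sudo', 'svc', '-u', '/etc/service/' + service])
    print(subprocess.check_output(['sudo', 'svstat', '/etc/service/' + service]))

restart_service('brozzler-dashboard')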

View file

@@ -1,20 +1,33 @@
 ---
-- name: mkdir {{venv_root}}/brozzler-dashboard-ve34
-  file: path={{venv_root}}/brozzler-dashboard-ve34 state=directory
+- name: mkdir {{venv_root}}/brozzler-dashboard-ve3
+  file: path={{venv_root}}/brozzler-dashboard-ve3 state=directory
         owner={{user}}
   become: true
 - name: install brozzler[dashboard] in virtualenv
-  pip: name='{{brozzler_pip_name}}[dashboard]'
-       virtualenv={{venv_root}}/brozzler-dashboard-ve34
-       virtualenv_python=python3.4
-       extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
+  pip:
+    name: '{{brozzler_pip_name}}[dashboard]'
+    virtualenv: '{{venv_root}}/brozzler-dashboard-ve3'
+    virtualenv_python: python3
+    virtualenv_command: python3 /usr/lib/python3/dist-packages/virtualenv.py
+    extra_args: '--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
   become: true
   become_user: '{{user}}'
   notify:
     - restart brozzler-dashboard
-- name: install upstart config /etc/init/brozzler-dashboard.conf
+- name: mkdir /etc/service/brozzler-dashboard
+  file:
+    path: /etc/service/brozzler-dashboard
+    state: directory
   become: true
-  template: src=templates/brozzler-dashboard.conf.j2
-            dest=/etc/init/brozzler-dashboard.conf
+- name: install /etc/service/brozzler-dashboard/run
+  template:
+    src: templates/brozzler-dashboard-run.j2
+    dest: /etc/service/brozzler-dashboard/run
+    mode: 0755
   notify:
     - restart brozzler-dashboard
+  become: true

View file

@@ -0,0 +1,15 @@
#!/bin/bash
logfile=/var/log/brozzler-dashboard.log
touch $logfile
chown {{user}} $logfile
source /opt/brozzler-dashboard-ve3/bin/activate
exec nice setuidgid {{user}} \
env WAYBACK_BASEURL=http://{{groups['pywb'][0]}}:8880/brozzler \
RETHINKDB_SERVERS={{groups['rethinkdb'] | join(',')}} \
RETHINKDB_DB=brozzler LANG=en_US.UTF-8 LC_COLLATE=C \
gunicorn --bind=0.0.0.0:8881 brozzler.dashboard:app \
>> $logfile 2>&1

View file

@@ -1,18 +0,0 @@
description "brozzler-dashboard"
start on runlevel [2345]
stop on runlevel [!2345]
env PYTHONPATH={{venv_root}}/brozzler-dashboard-ve34/lib/python3.4/site-packages
env PATH={{venv_root}}/brozzler-dashboard-ve34/bin:/usr/bin:/bin
env LC_ALL=C.UTF-8
env WAYBACK_BASEURL=http://{{groups['pywb'][0]}}:8880/brozzler
env RETHINKDB_SERVERS={{groups['rethinkdb'] | join(',')}}
env RETHINKDB_DB=brozzler
setuid {{user}}
console log
exec gunicorn --bind=0.0.0.0:8881 brozzler.dashboard:app

View file

@@ -1,13 +1,22 @@
 ---
 - name: restart Xvnc
-  service: name=Xvnc state=restarted
-  become: true
-- name: restart websockify
-  service: name=websockify state=restarted
+  svc:
+    name: Xvnc
+    state: restarted
+    service_dir: /etc/service
   become: true
 - name: restart vnc-websock
-  service: name=vnc-websock state=restarted
+  svc:
+    name: vnc-websock
+    state: restarted
+    service_dir: /etc/service
   become: true
 - name: restart brozzler-worker
-  service: name=brozzler-worker state=restarted
+  svc:
+    name: brozzler-worker
+    state: restarted
+    service_dir: /etc/service
   become: true

View file

@@ -3,14 +3,22 @@
   apt_repository: repo='deb http://archive.canonical.com/ubuntu trusty partner'
                   state=present
   become: true
 - apt: update_cache=yes
   become: true
 - name: ensure required packages are installed
   become: true
   apt: name={{item}} state=present
   with_items:
-    - vnc4server
     - chromium-browser
+    - vnc4server
+    - libjpeg-turbo8-dev
+    - zlib1g-dev
+    - gcc
+    - python3-dev
+    - python3-dbg
+    - adobe-flashplugin
     - xfonts-base
     - fonts-arphic-bkai00mp
     - fonts-arphic-bsmi00lp
@@ -24,51 +32,74 @@
     - fonts-sil-padauk
     - fonts-unfonts-extra
     - fonts-unfonts-core
-    - ttf-indic-fonts
+    - fonts-indic
     - fonts-thai-tlwg
     - fonts-lklug-sinhala
-    - git
-    - libjpeg-turbo8-dev
-    - zlib1g-dev
-    - gcc
-    - g++
-    - libpython3.4-dev
-    - adobe-flashplugin
-- name: install Xvnc upstart config /etc/init/Xvnc.conf
-  template: src=templates/Xvnc.conf.j2 dest=/etc/init/Xvnc.conf
+- name: mkdir /etc/service/{Xvnc,vnc-websock,brozzler-worker}
+  file:
+    path: '/etc/service/{{item}}'
+    state: directory
+  with_items:
+    - Xvnc
+    - vnc-websock
+    - brozzler-worker
   become: true
+- name: install /etc/service/Xvnc/run
+  template:
+    src: templates/Xvnc-run.j2
+    dest: /etc/service/Xvnc/run
+    mode: 0755
   notify:
     - restart Xvnc
-- name: mkdir {{venv_root}}/websockify-ve34
   become: true
-  file: path={{venv_root}}/websockify-ve34 state=directory owner={{user}}
+- name: mkdir {{venv_root}}/websockify-ve3
+  become: true
+  file: path={{venv_root}}/websockify-ve3 state=directory owner={{user}}
 - name: install websockify in virtualenv
-  pip: name=git+https://github.com/kanaka/websockify.git#egg=websockify
-       virtualenv={{venv_root}}/websockify-ve34
-       virtualenv_python=python3.4
-       extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
+  pip:
+    name: git+https://github.com/kanaka/websockify.git#egg=websockify
+    virtualenv: '{{venv_root}}/websockify-ve3'
+    virtualenv_python: python3
+    virtualenv_command: python3 /usr/lib/python3/dist-packages/virtualenv.py
+    extra_args: '--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
   become: true
   become_user: '{{user}}'
-- name: install vnc-websock upstart config /etc/init/vnc-websock.conf
-  template: src=templates/vnc-websock.conf.j2 dest=/etc/init/vnc-websock.conf
-  become: true
+- name: install /etc/service/vnc-websock/run
+  template:
+    src: templates/vnc-websock-run.j2
+    dest: /etc/service/vnc-websock/run
+    mode: 0755
   notify:
     - restart vnc-websock
-- name: mkdir {{venv_root}}/brozzler-ve34
   become: true
-  file: path={{venv_root}}/brozzler-ve34 state=directory owner={{user}}
+- name: mkdir {{venv_root}}/brozzler-ve3
+  become: true
+  file: path={{venv_root}}/brozzler-ve3 state=directory owner={{user}}
 - name: install brozzler in virtualenv
-  pip: # name=git+https://github.com/internetarchive/brozzler.git#egg=brozzler
-       name='{{brozzler_pip_name}}'
-       virtualenv={{venv_root}}/brozzler-ve34
-       virtualenv_python=python3.4
-       extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
+  pip:
+    name: '{{brozzler_pip_name}}'
+    virtualenv: '{{venv_root}}/brozzler-ve3'
+    virtualenv_python: python3
+    virtualenv_command: python3 /usr/lib/python3/dist-packages/virtualenv.py
+    extra_args: '--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
   become: true
   become_user: '{{user}}'
   notify:
     - restart brozzler-worker
-- name: install brozzler-worker upstart config /etc/init/brozzler-worker.conf
-  template: src=templates/brozzler-worker.conf.j2 dest=/etc/init/brozzler-worker.conf
-  become: true
+- name: install /etc/service/brozzler-worker/run
+  template:
+    src: templates/brozzler-worker-run.j2
+    dest: /etc/service/brozzler-worker/run
+    mode: 0755
   notify:
     - restart brozzler-worker
+  become: true

View file

@@ -0,0 +1,14 @@
#!/bin/bash
cd /tmp
logfile=/var/log/Xvnc.log
touch $logfile
chown {{user}} $logfile
exec nice setuidgid {{user}} Xvnc4 :1 -auth /tmp/Xauthority.{{user}} \
-geometry 1600x1000 -depth 24 -rfbwait 0 -nolisten tcp -rfbport 5901 \
-SecurityTypes None -pn -fp /usr/share/fonts/X11/misc/ -co /etc/X11/rgb \
AcceptCutText=0 AcceptPointerEvents=0 AcceptKeyEvents=0 \
>> $logfile 2>&1

View file

@@ -1,14 +0,0 @@
description "Xvnc"
start on runlevel [2345]
stop on runlevel [!2345]
setuid {{user}}
console log
exec nice Xvnc4 :1 -auth /tmp/Xauthority.{{user}} \
-geometry 1600x1000 -depth 24 -rfbwait 0 -nolisten tcp -rfbport 5901 \
-SecurityTypes None -pn -fp /usr/share/fonts/X11/misc/ -co /etc/X11/rgb \
AcceptCutText=0 AcceptPointerEvents=0 AcceptKeyEvents=0

View file

@@ -0,0 +1,17 @@
#!/bin/bash
logfile=/var/log/brozzler-worker.log
touch $logfile
chown {{user}} $logfile
source {{venv_root}}/brozzler-ve3/bin/activate
exec nice setuidgid {{user}} \
env DISPLAY=:1 LANG=en_US.UTF-8 LC_COLLATE=C \
brozzler-worker \
--rethinkdb-servers={{groups['rethinkdb'] | join(',')}} \
--max-browsers=4 \
--trace \
--warcprox-auto \
>> $logfile 2>&1

View file

@@ -1,25 +0,0 @@
description "brozzler-worker"
start on runlevel [2345]
stop on runlevel [!2345]
env DISPLAY=:1
env PATH={{venv_root}}/brozzler-ve34/bin:/usr/bin:/bin
env PYTHONPATH={{venv_root}}/brozzler-ve34/lib/python3.4/site-packages
env LANG=C.UTF-8
setuid {{user}}
console log
# depends on vnc server
start on started Xvnc
stop on stopping Xvnc
kill timeout 60
exec nice brozzler-worker \
--rethinkdb-servers={{groups['rethinkdb'] | join(',')}} \
--max-browsers=4 \
--verbose \
--warcprox-auto

View file

@@ -0,0 +1,10 @@
#!/bin/bash
logfile=/var/log/vnc-websock.log
touch $logfile
chown {{user}} $logfile
source /opt/websockify-ve3/bin/activate
exec nice setuidgid {{user}} websockify 0.0.0.0:8901 localhost:5901 >> $logfile 2>&1

View file

@@ -1,15 +0,0 @@
description "vnc-websock"
start on runlevel [2345]
stop on runlevel [!2345]
setuid {{user}}
console log
env PYTHONPATH={{venv_root}}/websockify-ve34/lib/python3.4/site-packages
env PATH={{venv_root}}/websockify-ve34/bin:/usr/bin:/bin
# port 8901 is hard-coded in brozzler/dashboard/static/partials/workers.html
exec nice websockify 0.0.0.0:8901 localhost:5901

View file

@@ -1,44 +1,74 @@
 ---
-# get latest pip (had problems with version from apt-get, specifically
-# "pip install pyopenssl" did not install the dependency "cryptography")
-# http://stackoverflow.com/questions/34587473/what-is-get-pip-py-checksum-where-can-i-get-it-for-sure
-- name: install setuptools for python 2 and 3
-  become: true
-  apt: name={{item}} state=present
-  with_items:
-    - python-setuptools
+- apt:
+    name:
     - python3-setuptools
-- name: download pip-9.0.1.tar.gz
-  get_url:
-    url: https://pypi.python.org/packages/11/b6/abcb525026a4be042b486df43905d6893fb04f05aac21c32c638e939e447/pip-9.0.1.tar.gz
-    dest: /tmp
-    checksum: sha1:57ff41e99cb01b6a1c2b0999161589b726f0ec8b
-- name: extract pip-9.0.1.tar.gz
-  unarchive: src=/tmp/pip-9.0.1.tar.gz dest=/tmp copy=no
+    - python3-pip
+    - python3-virtualenv
+    - daemontools
+    - daemontools-run
+    state: present
+    update_cache: yes
+    cache_valid_time: 86400 # one day
   become: true
+# # get recent virtualenv, which bundles a recent pip
+# - find:
+#     paths:
+#       - /usr/local/lib/python3.4/dist-packages
+#       - /usr/local/lib/python3.5/dist-packages
+#     recurse: true
+#     patterns: virtualenv.py
+#     contains: '__version__ = "16.4.3"'
+#   register: virtualenv_py_16_4_3
+#
+# - command: mktemp -d
+#   register: mktempd_out
+#   when: virtualenv_py_16_4_3.matched == 0
+#
+# - name: download virtualenv-16.4.3
+#   get_url:
+#     url: https://files.pythonhosted.org/packages/37/db/89d6b043b22052109da35416abc3c397655e4bd3cff031446ba02b9654fa/virtualenv-16.4.3.tar.gz
+#     dest: '{{mktempd_out.stdout}}'
+#     checksum: sha256:984d7e607b0a5d1329425dd8845bd971b957424b5ba664729fab51ab8c11bc39
+#   when: virtualenv_py_16_4_3.matched == 0
+#
+# - name: extract virtualenv-16.4.3.tar.gz
+#   unarchive:
+#     src: '{{mktempd_out.stdout}}/virtualenv-16.4.3.tar.gz'
+#     dest: '{{mktempd_out.stdout}}'
+#     copy: no
+#   when: virtualenv_py_16_4_3.matched == 0
+#
+# - name: run "python3 setup.py install" in {{mktempd_out.stdout}}/virtualenv-16.4.3
+#   become: true
+#   command: python3 setup.py install
+#   args:
+#     chdir: '{{mktempd_out.stdout}}/virtualenv-16.4.3'
+#   when: virtualenv_py_16_4_3.matched == 0
+#
+# - file:
+#     path: '{{mktempd_out.stdout}}'
+#     state: absent
+#   become: true
+#   when: virtualenv_py_16_4_3.matched == 0
 # this clause is a workaround for travis-ci, which only wants to install in /usr
 # see https://travis-ci.org/internetarchive/brozzler/builds/174338601
-# but it complains that /usr/lib/python3.4/site-packages doesn't exist
+# but it complains that /usr/lib/python3.5/site-packages doesn't exist
 # see https://travis-ci.org/internetarchive/brozzler/builds/174094831
-- file: path={{item}} state=directory
+- file:
+    path: '{{item}}'
+    state: directory
   with_items:
-    - /usr/lib/python3.4/site-packages
-    - /usr/lib/python3.4/dist-packages
+    - /usr/lib/python3.5/site-packages
+    - /usr/lib/python3.5/dist-packages
   become: true
-- name: run "python3 setup.py install" in /tmp/pip-9.0.1
-  command: python3 setup.py install
-           chdir=/tmp/pip-9.0.1
-           creates=/usr/local/lib/python3.4/dist-packages/pip-9.0.1-py3.4.egg/pip/__init__.py
-  become: true
-- name: run "pip install virtualenv"
-  command: pip install virtualenv
-           creates=/usr/local/lib/python3.4/dist-packages/virtualenv.py
-  become: true
 - command: id {{user}}
   register: id_user
   ignore_errors: true
   changed_when: false
 - name: ensure service user {{user}} exists
   user: name={{user}} system=yes createhome=no home=/nonexistent
         shell=/usr/sbin/nologin

View file

@@ -1,5 +1,9 @@
 ---
 - name: restart pywb
-  service: name=pywb state=restarted
+  svc:
+    name: pywb
+    state: restarted
+    service_dir: /etc/service
   become: true

View file

@@ -1,36 +1,52 @@
 ---
-- name: mkdir {{venv_root}}/pywb-ve34
-  file: path={{venv_root}}/pywb-ve34 state=directory
+- name: mkdir {{venv_root}}/pywb-ve3
+  file: path={{venv_root}}/pywb-ve3 state=directory
         owner={{user}}
   become: true
 - name: install pywb in virtualenv
-  pip: name=pywb
-       version=0.33.2
-       virtualenv={{venv_root}}/pywb-ve34
-       virtualenv_python=python3.4
-       extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
+  pip:
+    name: pywb
+    version: 0.33.2
+    virtualenv: '{{venv_root}}/pywb-ve3'
+    virtualenv_python: python3
+    virtualenv_command: python3 /usr/lib/python3/dist-packages/virtualenv.py
+    extra_args: '--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
   become: true
   become_user: '{{user}}'
   notify:
     - restart pywb
 - name: install brozzler in pywb virtualenv
-  pip: name='{{brozzler_pip_name}}'
-       virtualenv={{venv_root}}/pywb-ve34
-       virtualenv_python=python3.4
-       extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
+  pip:
+    name: '{{brozzler_pip_name}}'
+    virtualenv: '{{venv_root}}/pywb-ve3'
+    virtualenv_python: python3
+    virtualenv_command: python3 /usr/lib/python3/dist-packages/virtualenv.py
+    extra_args: '--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
   become: true
   become_user: '{{user}}'
   notify:
     - restart pywb
 - name: pywb config file /etc/pywb.yml
   template: src=templates/pywb.yml.j2
             dest=/etc/pywb.yml
   become: true
   notify:
     - restart pywb
-- name: upstart config file /etc/init/pywb.conf
-  template: src=templates/pywb.conf.j2
-            dest=/etc/init/pywb.conf
+- name: mkdir /etc/service/pywb
+  file:
+    path: /etc/service/pywb
+    state: directory
   become: true
+- name: install /etc/service/pywb/run
+  template:
+    src: templates/pywb-run.j2
+    dest: /etc/service/pywb/run
+    mode: 0755
   notify:
     - restart pywb
+  become: true

View file

@@ -0,0 +1,10 @@
#!/bin/bash
logfile=/var/log/pywb.log
touch $logfile
chown {{user}} $logfile
exec nice setuidgid {{user}} env PYWB_CONFIG_FILE=/etc/pywb.yml \
{{venv_root}}/pywb-ve3/bin/python {{venv_root}}/pywb-ve3/bin/brozzler-wayback \
>> $logfile 2>&1

View file

@@ -1,14 +0,0 @@
description "pywb"
start on runlevel [2345]
stop on runlevel [!2345]
env PYTHONPATH={{venv_root}}/pywb-ve34/lib/python3.4/site-packages
env PATH={{venv_root}}/pywb-ve34/bin:/usr/bin:/bin
env PYWB_CONFIG_FILE=/etc/pywb.yml
setuid {{user}}
console log
exec nice brozzler-wayback

View file

@@ -3,8 +3,9 @@
   apt_key: url=http://download.rethinkdb.com/apt/pubkey.gpg
   become: true
 - name: ensure rethinkdb repo is in apt sources.list
-  apt_repository: repo='deb http://download.rethinkdb.com/apt trusty main'
-                  state=present
+  apt_repository:
+    repo: 'deb http://download.rethinkdb.com/apt {{ansible_lsb.codename|lower}} main'
+    state: present
   become: true
 - apt: update_cache=yes
   become: true

View file

@@ -1,4 +1,7 @@
 ---
 - name: restart warcprox
-  service: name=warcprox state=restarted
+  svc:
+    name: warcprox
+    state: restarted
+    service_dir: /etc/service
   become: true

View file

@@ -4,26 +4,37 @@
   apt: name={{item}} state=present
   with_items:
     - gcc
-    - python3.4
-    - libpython3.4-dev
+    - python3-dev
    - libffi-dev
     - libssl-dev
     - tor
     - git
-- name: mkdir {{venv_root}}/warcprox-ve34
+- name: mkdir {{venv_root}}/warcprox-ve3
   become: true
-  file: path={{venv_root}}/warcprox-ve34 state=directory owner={{user}}
+  file: path={{venv_root}}/warcprox-ve3 state=directory owner={{user}}
 - name: install warcprox in virtualenv
-  pip: name=git+https://github.com/internetarchive/warcprox.git#egg=warcprox
-       virtualenv={{venv_root}}/warcprox-ve34
-       virtualenv_python=python3.4
-       extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
+  pip:
+    name: git+https://github.com/internetarchive/warcprox.git#egg=warcprox
+    virtualenv: '{{venv_root}}/warcprox-ve3'
+    virtualenv_python: python3
+    extra_args: --no-input --upgrade --pre --cache-dir=/tmp/pip-cache
+    virtualenv_command: python3 /usr/lib/python3/dist-packages/virtualenv.py
   become: true
   become_user: '{{user}}'
   notify:
     - restart warcprox
-- name: install upstart config /etc/init/warcprox.conf
+- name: mkdir /etc/service/warcprox
+  file:
+    path: /etc/service/warcprox
+    state: directory
   become: true
-  template: src=templates/warcprox.conf.j2 dest=/etc/init/warcprox.conf
+- name: install /etc/service/warcprox/run
+  template:
+    src: templates/run.j2
+    dest: /etc/service/warcprox/run
+    mode: 0755
   notify:
     - restart warcprox
+  become: true

View file

@@ -1,19 +1,16 @@
-description "warcprox"
-start on runlevel [2345]
-stop on runlevel [!2345]
-env PYTHONPATH={{venv_root}}/warcprox-ve34/lib/python3.4/site-packages
-env PATH={{venv_root}}/warcprox-ve34/bin:/usr/bin:/bin
-# by default warcprox creates some files/dirs relative to cwd
-chdir {{work_dir}}
-setuid {{user}}
-console log
-# --profile
-exec nice warcprox \
+#!/bin/bash
+logfile=/var/log/warcprox.log
+touch $logfile
+chown {{user}} $logfile
+ulimit -n 4096
+cd {{work_dir}}
+source {{venv_root}}/warcprox-ve3/bin/activate
+exec nice -n5 setuidgid {{user}} env LANG=en_US.UTF-8 LC_COLLATE=C warcprox \
     --address=0.0.0.0 \
     --dir={{warcs_dir}} \
     --base32 \
@@ -22,4 +19,6 @@ exec nice warcprox \
     --onion-tor-socks-proxy=localhost:9050 \
     --rethinkdb-services-url=rethinkdb://{{groups['rethinkdb']|join(',')}}/brozzler/services \
     --rethinkdb-stats-url=rethinkdb://{{groups['rethinkdb']|join(',')}}/brozzler/stats \
-    --rethinkdb-big-table-url=rethinkdb://{{groups['rethinkdb']|join(',')}}/brozzler/captures
+    --rethinkdb-big-table-url=rethinkdb://{{groups['rethinkdb']|join(',')}}/brozzler/captures \
+    >> $logfile 2>&1

View file

@@ -162,7 +162,7 @@ class ThreadExceptionGate:
     def queue_exception(self, e):
         with self.lock:
             if self.pending_exception:
-                self.logger.warn(
+                self.logger.warning(
                         '%r already pending for thread %r, discarding %r',
                         self.pending_exception, self.thread, e)
             else:

View file

@@ -223,7 +223,7 @@ class Chrome:
                 raise
             except Exception as e:
                 if time.time() - self._last_warning > 30:
-                    self.logger.warn(
+                    self.logger.warning(
                             'problem with %s (will keep trying until timeout '
                             'of %d seconds): %s', json_url, timeout_sec, e)
                     self._last_warning = time.time()
@@ -294,7 +294,7 @@ class Chrome:
                         'chrome pid %s exited normally',
                         self.chrome_process.pid)
             else:
-                self.logger.warn(
+                self.logger.warning(
                         'chrome pid %s exited with nonzero status %s',
                         self.chrome_process.pid, status)
@@ -305,13 +305,13 @@ class Chrome:
                 return
             time.sleep(0.5)
 
-        self.logger.warn(
+        self.logger.warning(
                 'chrome pid %s still alive %.1f seconds after sending '
                 'SIGTERM, sending SIGKILL', self.chrome_process.pid,
                 time.time() - t0)
         os.killpg(self.chrome_process.pid, signal.SIGKILL)
         status = self.chrome_process.wait()
-        self.logger.warn(
+        self.logger.warning(
                 'chrome pid %s reaped (status=%s) after killing with '
                 'SIGKILL', self.chrome_process.pid, status)

View file

@@ -2,7 +2,7 @@
 '''
 brozzler/cli.py - brozzler command line executables
 
-Copyright (C) 2014-2017 Internet Archive
+Copyright (C) 2014-2019 Internet Archive
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -606,6 +606,10 @@ def brozzler_purge(argv=None):
         '--site', dest='site', metavar='SITE_ID', help=(
             'purge crawl state from rethinkdb for a site, including all '
             'pages'))
+    group.add_argument(
+        '--finished-before', dest='finished_before', metavar='YYYY-MM-DD',
+        help=('purge crawl state from rethinkdb for a jobs that ended '
+              'before this date'))
     arg_parser.add_argument(
         '--force', dest='force', action='store_true', help=(
             'purge even if job or site is still has status ACTIVE'))
@@ -628,7 +632,7 @@ def brozzler_purge(argv=None):
             sys.exit(1)
         if job.status == 'ACTIVE':
             if args.force:
-                logging.warn(
+                logging.warning(
                     'job %s has status ACTIVE, purging anyway because '
                     '--force was supplied', job_id)
             else:
@@ -645,7 +649,7 @@ def brozzler_purge(argv=None):
             sys.exit(1)
         if site.status == 'ACTIVE':
             if args.force:
-                logging.warn(
+                logging.warning(
                     'site %s has status ACTIVE, purging anyway because '
                     '--force was supplied', site_id)
             else:
@@ -654,6 +658,20 @@ def brozzler_purge(argv=None):
                     '(override with --force)', site_id)
             sys.exit(1)
         _purge_site(rr, site_id)
+    elif args.finished_before:
+        finished_before = datetime.datetime.strptime(
+            args.finished_before, '%Y-%m-%d').replace(
+                tzinfo=doublethink.UTC)
+        reql = rr.table('jobs').filter(
+            r.row['finished'].default(r.maxval).lt(finished_before).or_(
+                r.row['starts_and_stops'].nth(-1)['stop'].default(r.maxval).lt(finished_before)))
+        logging.debug(
+            'retrieving jobs older than %s: %s', finished_before, reql)
+        for job in reql.run():
+            # logging.info('job %s finished=%s starts_and_stops[-1]["stop"]=%s',
+            #         job['id'], job.get('finished'),
+            #         job.get('starts_and_stops', [{'stop':None}])[-1]['stop'])
+            _purge_job(rr, job['id'])
 
 def _purge_site(rr, site_id):
     reql = rr.table('pages').between(
@@ -713,7 +731,7 @@ def brozzler_list_captures(argv=None):
     if args.url_or_sha1[:5] == 'sha1:':
         if args.prefix:
-            logging.warn(
+            logging.warning(
                     'ignoring supplied --prefix option which does not apply '
                     'to lookup by sha1')
         # assumes it's already base32 (XXX could detect if hex and convert)
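The new --finished-before branch above leans on ReQL's default(): a job with no 'finished' field (or no final 'stop') falls back to r.maxval, so it can never compare less-than the cutoff and still-active jobs are left alone. A rough standalone sketch of the same query outside the CLI (assuming the rethinkdb driver is importable as r, as cli.py does, and a local RethinkDB with the brozzler db):

import datetime
import doublethink
import rethinkdb as r

rr = doublethink.Rethinker('localhost', db='brozzler')
cutoff = datetime.datetime.strptime('2019-01-01', '%Y-%m-%d').replace(
        tzinfo=doublethink.UTC)
# jobs whose 'finished' time, or last 'stop' time, is before the cutoff;
# missing fields default to r.maxval so unfinished jobs never match
reql = rr.table('jobs').filter(
        r.row['finished'].default(r.maxval).lt(cutoff).or_(
            r.row['starts_and_stops'].nth(-1)['stop'].default(r.maxval).lt(cutoff)))
for job in reql.run():
    print(job['id'])    # brozzler-purge would call _purge_job(rr, job['id']) here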

View file

@@ -260,7 +260,7 @@ class BrozzlerEasyController:
             state_strs.append(str(th))
             stack = traceback.format_stack(sys._current_frames()[th.ident])
             state_strs.append(''.join(stack))
-        logging.warn('dumping state (caught signal {})\n{}'.format(
+        logging.warning('dumping state (caught signal {})\n{}'.format(
             signum, '\n'.join(state_strs)))
 
 def main(argv=None):

View file

@@ -138,7 +138,7 @@ class RethinkDbFrontier:
         sites = []
         for i in range(result["replaced"]):
             if result["changes"][i]["old_val"]["claimed"]:
-                self.logger.warn(
+                self.logger.warning(
                         "re-claimed site that was still marked 'claimed' "
                         "because it was last claimed a long time ago "
                         "at %s, and presumably some error stopped it from "
@@ -225,7 +225,7 @@ class RethinkDbFrontier:
         if not job:
             return False
         if job.status.startswith("FINISH"):
-            self.logger.warn("%s is already %s", job, job.status)
+            self.logger.warning("%s is already %s", job, job.status)
             return True
 
         results = self.rr.table("sites").get_all(job_id, index="job_id").run()
@@ -415,7 +415,7 @@ class RethinkDbFrontier:
             assert isinstance(e, brozzler.ReachedLimit)
             if (site.reached_limit
                     and site.reached_limit != e.warcprox_meta["reached-limit"]):
-                self.logger.warn(
+                self.logger.warning(
                         "reached limit %s but site had already reached limit %s",
                         e.warcprox_meta["reached-limit"], self.reached_limit)
             else:
@@ -434,7 +434,7 @@ class RethinkDbFrontier:
                 index="priority_by_site").filter({"hops_from_seed":0}).run()
         pages = list(results)
         if len(pages) > 1:
-            self.logger.warn(
+            self.logger.warning(
                     "more than one seed page for site_id %s ?", site_id)
         if len(pages) < 1:
             return None

View file

@@ -106,7 +106,7 @@ def is_permitted_by_robots(site, url, proxy=None):
             # reppy has wrapped an exception that we want to bubble up
             raise brozzler.ProxyError(e)
         else:
-            logging.warn(
+            logging.warning(
                     "returning true (permitted) after problem fetching "
                     "robots.txt for %r: %r", url, e)
             return True

View file

@@ -147,13 +147,13 @@ class BrozzlerWorker:
         try:
             with urllib.request.urlopen(request, timeout=600) as response:
                 if response.getcode() != 204:
-                    self.logger.warn(
+                    self.logger.warning(
                             'got "%s %s" response on warcprox '
                             'WARCPROX_WRITE_RECORD request (expected 204)',
                             response.getcode(), response.reason)
             return request, response
         except urllib.error.HTTPError as e:
-            self.logger.warn(
+            self.logger.warning(
                     'got "%s %s" response on warcprox '
                     'WARCPROX_WRITE_RECORD request (expected 204)',
                     e.getcode(), e.info())
@@ -370,7 +370,7 @@ class BrozzlerWorker:
             if (page.needs_robots_check and
                     not brozzler.is_permitted_by_robots(
                         site, page.url, self._proxy_for(site))):
-                logging.warn("page %s is blocked by robots.txt", page.url)
+                logging.warning("page %s is blocked by robots.txt", page.url)
                 page.blocked_by_robots = True
                 self._frontier.completed_page(site, page)
             else:
@@ -544,7 +544,7 @@ class BrozzlerWorker:
     def start(self):
         with self._start_stop_lock:
             if self._thread:
-                self.logger.warn(
+                self.logger.warning(
                         'ignoring start request because self._thread is '
                         'not None')
                 return

View file

@@ -48,7 +48,7 @@ _orig_webpage_read_content = youtube_dl.extractor.generic.GenericIE._webpage_read_content
 def _webpage_read_content(self, *args, **kwargs):
     content = _orig_webpage_read_content(self, *args, **kwargs)
     if len(content) > 20000000:
-        logging.warn(
+        logging.warning(
                 'bypassing youtube-dl extraction because content is '
                 'too large (%s characters)', len(content))
         return ''
@@ -185,7 +185,7 @@ def _build_youtube_dl(worker, destdir, site):
                     mimetype = magic.from_file(ctx['filename'], mime=True)
                 except ImportError as e:
                     mimetype = 'video/%s' % info_dict['ext']
-                    self.logger.warn(
+                    self.logger.warning(
                             'guessing mimetype %s because %r', mimetype, e)
                 url = 'youtube-dl:%05d:%s' % (

View file

@@ -339,12 +339,12 @@ Brozzler derives its general approach to the seed surt from `heritrix
    slash.
 2. Canonicalization does not attempt to match heritrix exactly, though it
    usually does match.
-3. When generating a SURT for an HTTPS URL, heritrix changes the scheme to
-   HTTP. For example, the heritrix SURT for ``https://www.example.com/`` is
-   ``http://(com,example,www,)`` and this means that all of
-   ``http://www.example.com/*`` and ``https://www.example.com/*`` are in
-   scope. It also means that a manually specified SURT with scheme "https" does
-   not match anything. Brozzler does no scheme munging.
+3. Brozzler does no scheme munging. (When generating a SURT for an HTTPS URL,
+   heritrix changes the scheme to HTTP. For example, the heritrix SURT for
+   ``https://www.example.com/`` is ``http://(com,example,www,)`` and this means
+   that all of ``http://www.example.com/*`` and ``https://www.example.com/*``
+   are in scope. It also means that a manually specified SURT with scheme
+   "https" does not match anything.)
 4. Brozzler identifies seed "redirects" by retrieving the URL from the
    browser's location bar at the end of brozzling the seed page, whereas
    heritrix follows HTTP 3XX redirects. If the URL in the browser
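Point 3 is the one that trips people up in practice: with an https seed, plain-http URLs on the same host fall outside the scope unless they are added explicitly. A toy illustration of the prefix check, using the ssurt form that appears in the updated tests ('localhost,//<port>:http:/site5/redirect/'); the https variant below is an assumption extrapolated from that format:

# hypothetical ssurt strings; brozzler keeps the original scheme in the surt
accept_ssurt = 'com,example,www,//:https:/'             # from an https seed
candidate    = 'com,example,www,//:http:/page2.html'    # same host, plain http

# heritrix would have folded both to http:// and matched; brozzler does not
print(candidate.startswith(accept_ssurt))   # False -- the http page is out of scope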

View file

@@ -32,7 +32,7 @@ def find_package_data(package):
 
 setuptools.setup(
         name='brozzler',
-        version='1.5.4',
+        version='1.5.6',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',
@@ -64,10 +64,10 @@ setuptools.setup(
            ],
         },
         install_requires=[
-            'PyYAML>=3.12',
+            'PyYAML>=5.1',
             'youtube-dl>=2018.7.21',
             'reppy==0.3.4',
-            'requests>=2.18.4',
+            'requests>=2.21',
             'websocket-client>=0.39.0,<=0.48.0',
             'pillow>=5.2.0',
             'urlcanon>=0.1.dev23',
@@ -80,13 +80,13 @@ setuptools.setup(
         ],
         extras_require={
             'dashboard': [
-                'flask>=0.11',
+                'flask>=1.0',
                 'gunicorn>=19.8.1'
             ],
             'easy': [
                 'warcprox>=2.4b2.dev173',
                 'pywb>=0.33.2,<2',
-                'flask>=0.11',
+                'flask>=1.0',
                 'gunicorn>=19.8.1'
             ],
         },

View file

@@ -67,8 +67,8 @@ def httpd(request):
                 self.send_header('WWW-Authenticate', 'Basic realm=\"Test\"')
                 self.send_header('Content-type', 'text/html')
                 self.end_headers()
-                self.wfile.write(self.headers.getheader('Authorization'))
-                self.wfile.write('not authenticated')
+                self.wfile.write(self.headers.get('Authorization', b''))
+                self.wfile.write(b'not authenticated')
             else:
                 super().do_GET()

View file

@ -34,16 +34,41 @@ import http.server
import logging import logging
import warcprox import warcprox
# https://stackoverflow.com/questions/166506/finding-local-ip-addresses-using-pythons-stdlib
def _local_address():
import socket
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
try:
s.connect(('10.255.255.255', 1)) # ip doesn't need to be reachable
return s.getsockname()[0]
except:
return '127.0.0.1'
finally:
s.close()
local_address = _local_address()
def start_service(service): def start_service(service):
subprocess.check_call(['sudo', 'service', service, 'start']) subprocess.check_call(['sudo', 'svc', '-u', '/etc/service/' + service])
def stop_service(service): def stop_service(service):
subprocess.check_call(['sudo', 'service', service, 'stop']) subprocess.check_call(['sudo', 'svc', '-d', '/etc/service/' + service])
while True:
status = subprocess.check_output(
['sudo', 'svstat', '/etc/service/' + service])
if b' down ' in status:
break
time.sleep(0.5)
@pytest.fixture(scope='module') @pytest.fixture(scope='module')
def httpd(request): def httpd(request):
class RequestHandler(http.server.SimpleHTTPRequestHandler): class RequestHandler(http.server.SimpleHTTPRequestHandler):
def do_POST(self):
logging.info('\n%s\n%s', self.requestline, self.headers)
self.do_GET()
def do_GET(self): def do_GET(self):
logging.info('\n%s\n%s', self.requestline, self.headers)
if self.path == '/site5/redirect/': if self.path == '/site5/redirect/':
self.send_response(303, 'See other') self.send_response(303, 'See other')
self.send_header('Connection', 'close') self.send_header('Connection', 'close')
@ -82,7 +107,7 @@ def httpd(request):
# SimpleHTTPRequestHandler always uses CWD so we have to chdir # SimpleHTTPRequestHandler always uses CWD so we have to chdir
os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs')) os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs'))
httpd = http.server.HTTPServer(('localhost', 0), RequestHandler) httpd = http.server.HTTPServer((local_address, 0), RequestHandler)
httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever) httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
httpd_thread.start() httpd_thread.start()
@ -94,6 +119,9 @@ def httpd(request):
return httpd return httpd
def make_url(httpd, rel_url):
return 'http://%s:%s%s' % (local_address, httpd.server_port, rel_url)
def test_httpd(httpd): def test_httpd(httpd):
''' '''
Tests that our http server is working as expected, and that two fetches Tests that our http server is working as expected, and that two fetches
@ -101,7 +129,7 @@ def test_httpd(httpd):
deduplication. deduplication.
''' '''
payload1 = content2 = None payload1 = content2 = None
url = 'http://localhost:%s/site1/file1.txt' % httpd.server_port url = make_url(httpd, '/site1/file1.txt')
with urllib.request.urlopen(url) as response: with urllib.request.urlopen(url) as response:
assert response.status == 200 assert response.status == 200
payload1 = response.read() payload1 = response.read()
@ -140,13 +168,13 @@ def test_brozzle_site(httpd):
test_id = 'test_brozzle_site-%s' % datetime.datetime.utcnow().isoformat() test_id = 'test_brozzle_site-%s' % datetime.datetime.utcnow().isoformat()
rr = doublethink.Rethinker('localhost', db='brozzler') rr = doublethink.Rethinker('localhost', db='brozzler')
site = brozzler.Site(rr, { site = brozzler.Site(rr, {
'seed': 'http://localhost:%s/site1/' % httpd.server_port, 'seed': make_url(httpd, '/site1/'),
'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}}) 'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
# the two pages we expect to be crawled # the two pages we expect to be crawled
page1 = 'http://localhost:%s/site1/' % httpd.server_port page1 = make_url(httpd, '/site1/')
page2 = 'http://localhost:%s/site1/file1.txt' % httpd.server_port page2 = make_url(httpd, '/site1/file1.txt')
robots = 'http://localhost:%s/robots.txt' % httpd.server_port robots = make_url(httpd, '/robots.txt')
# so we can examine rethinkdb before it does anything # so we can examine rethinkdb before it does anything
try: try:
@ -171,8 +199,7 @@ def test_brozzle_site(httpd):
pages = list(frontier.site_pages(site.id)) pages = list(frontier.site_pages(site.id))
assert len(pages) == 2 assert len(pages) == 2
assert {page.url for page in pages} == { assert {page.url for page in pages} == {
'http://localhost:%s/site1/' % httpd.server_port, make_url(httpd, '/site1/'), make_url(httpd, '/site1/file1.txt')}
'http://localhost:%s/site1/file1.txt' % httpd.server_port}
time.sleep(2) # in case warcprox hasn't finished processing urls time.sleep(2) # in case warcprox hasn't finished processing urls
# take a look at the captures table # take a look at the captures table
@ -255,8 +282,8 @@ def test_proxy_non_warcprox(httpd):
start_service('brozzler-worker') start_service('brozzler-worker')
assert len(proxy.requests) <= 15 assert len(proxy.requests) <= 15
assert proxy.requests.count('GET /status') == 1 assert proxy.requests.count('GET /status') == 1
assert ('GET http://localhost:%s/site1/' % httpd.server_port) in proxy.requests assert ('GET %s' % make_url(httpd, '/site1/')) in proxy.requests
assert ('GET http://localhost:%s/site1/file1.txt' % httpd.server_port) in proxy.requests assert ('GET %s' % make_url(httpd, '/site1/file1.txt')) in proxy.requests
assert [req for req in proxy.requests if req.startswith('WARCPROX_WRITE_RECORD')] == [] assert [req for req in proxy.requests if req.startswith('WARCPROX_WRITE_RECORD')] == []
proxy.shutdown() proxy.shutdown()
@ -292,14 +319,14 @@ def _test_proxy_setting(
datetime.datetime.utcnow().isoformat()) datetime.datetime.utcnow().isoformat())
# the two pages we expect to be crawled # the two pages we expect to be crawled
page1 = 'http://localhost:%s/site1/' % httpd.server_port page1 = make_url(httpd, '/site1/')
page2 = 'http://localhost:%s/site1/file1.txt' % httpd.server_port page2 = make_url(httpd, '/site1/file1.txt')
robots = 'http://localhost:%s/robots.txt' % httpd.server_port robots = make_url(httpd, '/robots.txt')
rr = doublethink.Rethinker('localhost', db='brozzler') rr = doublethink.Rethinker('localhost', db='brozzler')
service_registry = doublethink.ServiceRegistry(rr) service_registry = doublethink.ServiceRegistry(rr)
site = brozzler.Site(rr, { site = brozzler.Site(rr, {
'seed': 'http://localhost:%s/site1/' % httpd.server_port, 'seed': make_url(httpd, '/site1/'),
'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}}) 'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
assert site.id is None assert site.id is None
frontier = brozzler.RethinkDbFrontier(rr) frontier = brozzler.RethinkDbFrontier(rr)
@ -332,8 +359,8 @@ def _test_proxy_setting(
pages = list(frontier.site_pages(site.id)) pages = list(frontier.site_pages(site.id))
assert len(pages) == 2 assert len(pages) == 2
assert {page.url for page in pages} == { assert {page.url for page in pages} == {
'http://localhost:%s/site1/' % httpd.server_port, make_url(httpd, '/site1/'),
'http://localhost:%s/site1/file1.txt' % httpd.server_port} make_url(httpd, '/site1/file1.txt')}
time.sleep(2) # in case warcprox hasn't finished processing urls time.sleep(2) # in case warcprox hasn't finished processing urls
# take a look at the captures table # take a look at the captures table
@ -360,7 +387,7 @@ def test_obey_robots(httpd):
test_id = 'test_obey_robots-%s' % datetime.datetime.utcnow().isoformat() test_id = 'test_obey_robots-%s' % datetime.datetime.utcnow().isoformat()
rr = doublethink.Rethinker('localhost', db='brozzler') rr = doublethink.Rethinker('localhost', db='brozzler')
site = brozzler.Site(rr, { site = brozzler.Site(rr, {
'seed': 'http://localhost:%s/site1/' % httpd.server_port, 'seed': make_url(httpd, '/site1/'),
'user_agent': 'im a badbot', # robots.txt blocks badbot 'user_agent': 'im a badbot', # robots.txt blocks badbot
'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}}) 'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
@ -390,12 +417,12 @@ def test_obey_robots(httpd):
pages = list(frontier.site_pages(site.id)) pages = list(frontier.site_pages(site.id))
assert len(pages) == 1 assert len(pages) == 1
page = pages[0] page = pages[0]
assert page.url == 'http://localhost:%s/site1/' % httpd.server_port assert page.url == make_url(httpd, '/site1/')
assert page.blocked_by_robots assert page.blocked_by_robots
# take a look at the captures table # take a look at the captures table
time.sleep(2) # in case warcprox hasn't finished processing urls time.sleep(2) # in case warcprox hasn't finished processing urls
robots_url = 'http://localhost:%s/robots.txt' % httpd.server_port robots_url = make_url(httpd, '/robots.txt')
captures = list(rr.table('captures').filter({'test_id':test_id}).run()) captures = list(rr.table('captures').filter({'test_id':test_id}).run())
assert len(captures) == 1 assert len(captures) == 1
assert captures[0]['url'] == robots_url assert captures[0]['url'] == robots_url
@ -412,7 +439,7 @@ def test_login(httpd):
test_id = 'test_login-%s' % datetime.datetime.utcnow().isoformat() test_id = 'test_login-%s' % datetime.datetime.utcnow().isoformat()
rr = doublethink.Rethinker('localhost', db='brozzler') rr = doublethink.Rethinker('localhost', db='brozzler')
site = brozzler.Site(rr, { site = brozzler.Site(rr, {
'seed': 'http://localhost:%s/site2/' % httpd.server_port, 'seed': make_url(httpd, '/site2/'),
'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}, 'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}},
'username': 'test_username', 'password': 'test_password'}) 'username': 'test_username', 'password': 'test_password'})
@ -428,7 +455,7 @@ def test_login(httpd):
# take a look at the captures table # take a look at the captures table
time.sleep(2) # in case warcprox hasn't finished processing urls time.sleep(2) # in case warcprox hasn't finished processing urls
robots_url = 'http://localhost:%s/robots.txt' % httpd.server_port robots_url = make_url(httpd, '/robots.txt')
captures = list(rr.table('captures').filter( captures = list(rr.table('captures').filter(
{'test_id':test_id}).order_by('timestamp').run()) {'test_id':test_id}).order_by('timestamp').run())
meth_url = ['%s %s' % (c['http_method'], c['url']) for c in captures] meth_url = ['%s %s' % (c['http_method'], c['url']) for c in captures]
@ -436,25 +463,25 @@ def test_login(httpd):
# there are several forms in in htdocs/site2/login.html but only one # there are several forms in in htdocs/site2/login.html but only one
# that brozzler's heuristic should match and try to submit, and it has # that brozzler's heuristic should match and try to submit, and it has
# action='00', so we can check for that here # action='00', so we can check for that here
assert ('POST http://localhost:%s/site2/00' % httpd.server_port) in meth_url assert ('POST %s' % make_url(httpd, '/site2/00')) in meth_url
# sanity check the rest of the crawl # sanity check the rest of the crawl
assert ('GET http://localhost:%s/robots.txt' % httpd.server_port) in meth_url assert ('GET %s' % make_url(httpd, '/robots.txt')) in meth_url
assert ('GET http://localhost:%s/site2/' % httpd.server_port) in meth_url assert ('GET %s' % make_url(httpd, '/site2/')) in meth_url
assert ('WARCPROX_WRITE_RECORD screenshot:http://localhost:%s/site2/' % httpd.server_port) in meth_url assert ('WARCPROX_WRITE_RECORD screenshot:%s' % make_url(httpd, '/site2/')) in meth_url
assert ('WARCPROX_WRITE_RECORD thumbnail:http://localhost:%s/site2/' % httpd.server_port) in meth_url assert ('WARCPROX_WRITE_RECORD thumbnail:%s' % make_url(httpd, '/site2/')) in meth_url
assert ('GET http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url assert ('GET %s' % make_url(httpd, '/site2/login.html')) in meth_url
assert ('WARCPROX_WRITE_RECORD screenshot:http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url assert ('WARCPROX_WRITE_RECORD screenshot:%s' % make_url(httpd, '/site2/login.html')) in meth_url
assert ('WARCPROX_WRITE_RECORD thumbnail:http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url assert ('WARCPROX_WRITE_RECORD thumbnail:%s' % make_url(httpd, '/site2/login.html')) in meth_url
def test_seed_redirect(httpd): def test_seed_redirect(httpd):
test_id = 'test_seed_redirect-%s' % datetime.datetime.utcnow().isoformat() test_id = 'test_seed_redirect-%s' % datetime.datetime.utcnow().isoformat()
rr = doublethink.Rethinker('localhost', db='brozzler') rr = doublethink.Rethinker('localhost', db='brozzler')
seed_url = 'http://localhost:%s/site5/redirect/' % httpd.server_port seed_url = make_url(httpd, '/site5/redirect/')
site = brozzler.Site(rr, { site = brozzler.Site(rr, {
'seed': 'http://localhost:%s/site5/redirect/' % httpd.server_port, 'seed': make_url(httpd, '/site5/redirect/'),
'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}}) 'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
assert site.scope == {'accepts': [{'ssurt': 'localhost,//%s:http:/site5/redirect/' % httpd.server_port}]} assert site.scope == {'accepts': [{'ssurt': '%s//%s:http:/site5/redirect/' % (local_address, httpd.server_port)}]}
frontier = brozzler.RethinkDbFrontier(rr) frontier = brozzler.RethinkDbFrontier(rr)
brozzler.new_site(frontier, site) brozzler.new_site(frontier, site)
@ -473,19 +500,19 @@ def test_seed_redirect(httpd):
pages.sort(key=lambda page: page.hops_from_seed) pages.sort(key=lambda page: page.hops_from_seed)
assert pages[0].hops_from_seed == 0 assert pages[0].hops_from_seed == 0
assert pages[0].url == seed_url assert pages[0].url == seed_url
assert pages[0].redirect_url == 'http://localhost:%s/site5/destination/' % httpd.server_port assert pages[0].redirect_url == make_url(httpd, '/site5/destination/')
assert pages[1].hops_from_seed == 1 assert pages[1].hops_from_seed == 1
assert pages[1].url == 'http://localhost:%s/site5/destination/page2.html' % httpd.server_port assert pages[1].url == make_url(httpd, '/site5/destination/page2.html')
# check that scope has been updated properly # check that scope has been updated properly
assert site.scope == {'accepts': [ assert site.scope == {'accepts': [
{'ssurt': 'localhost,//%s:http:/site5/redirect/' % httpd.server_port}, {'ssurt': '%s//%s:http:/site5/redirect/' % (local_address, httpd.server_port)},
{'ssurt': 'localhost,//%s:http:/site5/destination/' % httpd.server_port}]} {'ssurt': '%s//%s:http:/site5/destination/' % (local_address, httpd.server_port)}]}
def test_hashtags(httpd): def test_hashtags(httpd):
test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat() test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat()
rr = doublethink.Rethinker('localhost', db='brozzler') rr = doublethink.Rethinker('localhost', db='brozzler')
seed_url = 'http://localhost:%s/site7/' % httpd.server_port seed_url = make_url(httpd, '/site7/')
site = brozzler.Site(rr, { site = brozzler.Site(rr, {
'seed': seed_url, 'seed': seed_url,
'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}}) 'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
@ -507,9 +534,9 @@ def test_hashtags(httpd):
assert pages[0].url == seed_url assert pages[0].url == seed_url
assert pages[0].hops_from_seed == 0 assert pages[0].hops_from_seed == 0
assert pages[0].brozzle_count == 1 assert pages[0].brozzle_count == 1
assert pages[0].outlinks['accepted'] == ['http://localhost:%s/site7/foo.html' % httpd.server_port] assert pages[0].outlinks['accepted'] == [make_url(httpd, '/site7/foo.html')]
assert not pages[0].hashtags assert not pages[0].hashtags
assert pages[1].url == 'http://localhost:%s/site7/foo.html' % httpd.server_port assert pages[1].url == make_url(httpd, '/site7/foo.html')
assert pages[1].hops_from_seed == 1 assert pages[1].hops_from_seed == 1
assert pages[1].brozzle_count == 1 assert pages[1].brozzle_count == 1
assert sorted(pages[1].hashtags) == ['#boosh','#ignored','#whee',] assert sorted(pages[1].hashtags) == ['#boosh','#ignored','#whee',]
@ -520,18 +547,18 @@ def test_hashtags(httpd):
captures_by_url = { captures_by_url = {
c['url']: c for c in captures if c['http_method'] != 'HEAD'} c['url']: c for c in captures if c['http_method'] != 'HEAD'}
assert seed_url in captures_by_url assert seed_url in captures_by_url
assert 'http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url assert make_url(httpd, '/site7/foo.html') in captures_by_url
assert 'http://localhost:%s/site7/whee.txt' % httpd.server_port in captures_by_url assert make_url(httpd, '/site7/whee.txt') in captures_by_url
assert 'http://localhost:%s/site7/boosh.txt' % httpd.server_port in captures_by_url assert make_url(httpd, '/site7/boosh.txt') in captures_by_url
assert 'screenshot:%s' % seed_url in captures_by_url assert 'screenshot:%s' % seed_url in captures_by_url
assert 'thumbnail:%s' % seed_url in captures_by_url assert 'thumbnail:%s' % seed_url in captures_by_url
assert 'screenshot:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url assert 'screenshot:%s' % make_url(httpd, '/site7/foo.html') in captures_by_url
assert 'thumbnail:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url assert 'thumbnail:%s' % make_url(httpd, '/site7/foo.html') in captures_by_url
def test_redirect_hashtags(httpd): def test_redirect_hashtags(httpd):
test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat() test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat()
rr = doublethink.Rethinker('localhost', db='brozzler') rr = doublethink.Rethinker('localhost', db='brozzler')
seed_url = 'http://localhost:%s/site9/' % httpd.server_port seed_url = make_url(httpd, '/site9/')
site = brozzler.Site(rr, { site = brozzler.Site(rr, {
'seed': seed_url, 'seed': seed_url,
'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}}) 'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
@@ -553,9 +580,9 @@ def test_redirect_hashtags(httpd):
assert pages[0].url == seed_url assert pages[0].url == seed_url
assert pages[0].hops_from_seed == 0 assert pages[0].hops_from_seed == 0
assert pages[0].brozzle_count == 1 assert pages[0].brozzle_count == 1
assert pages[0].outlinks['accepted'] == ['http://localhost:%s/site9/redirect.html' % httpd.server_port] assert pages[0].outlinks['accepted'] == [make_url(httpd, '/site9/redirect.html')]
assert not pages[0].hashtags assert not pages[0].hashtags
assert pages[1].url == 'http://localhost:%s/site9/redirect.html' % httpd.server_port assert pages[1].url == make_url(httpd, '/site9/redirect.html')
assert pages[1].hops_from_seed == 1 assert pages[1].hops_from_seed == 1
assert pages[1].brozzle_count == 1 assert pages[1].brozzle_count == 1
assert sorted(pages[1].hashtags) == ['#hash1','#hash2',] assert sorted(pages[1].hashtags) == ['#hash1','#hash2',]
@@ -563,7 +590,7 @@ def test_redirect_hashtags(httpd):
time.sleep(2) # in case warcprox hasn't finished processing urls time.sleep(2) # in case warcprox hasn't finished processing urls
# take a look at the captures table # take a look at the captures table
captures = rr.table('captures').filter({'test_id':test_id}).run() captures = rr.table('captures').filter({'test_id':test_id}).run()
redirect_captures = [c for c in captures if c['url'] == 'http://localhost:%s/site9/redirect.html' % httpd.server_port and c['http_method'] == 'GET'] redirect_captures = [c for c in captures if c['url'] == make_url(httpd, '/site9/redirect.html') and c['http_method'] == 'GET']
assert len(redirect_captures) == 2 # youtube-dl + browser, no hashtags assert len(redirect_captures) == 2 # youtube-dl + browser, no hashtags
# === expected captures === # === expected captures ===
@@ -589,9 +616,9 @@ def test_stop_crawl(httpd):
# create a new job with three sites that could be crawled forever # create a new job with three sites that could be crawled forever
job_conf = {'seeds': [ job_conf = {'seeds': [
{'url': 'http://localhost:%s/infinite/foo/' % httpd.server_port}, {'url': make_url(httpd, '/infinite/foo/')},
{'url': 'http://localhost:%s/infinite/bar/' % httpd.server_port}, {'url': make_url(httpd, '/infinite/bar/')},
{'url': 'http://localhost:%s/infinite/baz/' % httpd.server_port}]} {'url': make_url(httpd, '/infinite/baz/')}]}
job = brozzler.new_job(frontier, job_conf) job = brozzler.new_job(frontier, job_conf)
assert job.id assert job.id
@@ -675,7 +702,7 @@ def test_warcprox_outage_resiliency(httpd):
# put together a site to crawl # put together a site to crawl
test_id = 'test_warcprox_death-%s' % datetime.datetime.utcnow().isoformat() test_id = 'test_warcprox_death-%s' % datetime.datetime.utcnow().isoformat()
site = brozzler.Site(rr, { site = brozzler.Site(rr, {
'seed': 'http://localhost:%s/infinite/' % httpd.server_port, 'seed': make_url(httpd, '/infinite/'),
'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}}) 'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
try: try:
@@ -684,7 +711,7 @@ def test_warcprox_outage_resiliency(httpd):
try: try:
stop_service('warcprox') stop_service('warcprox')
except Exception as e: except Exception as e:
logging.warn('problem stopping warcprox service: %s', e) logging.warning('problem stopping warcprox service: %s', e)
# queue the site for brozzling # queue the site for brozzling
brozzler.new_site(frontier, site) brozzler.new_site(frontier, site)
@@ -771,7 +798,7 @@ def test_time_limit(httpd):
# create a new job with one seed that could be crawled forever # create a new job with one seed that could be crawled forever
job_conf = {'seeds': [{ job_conf = {'seeds': [{
'url': 'http://localhost:%s/infinite/foo/' % httpd.server_port, 'url': make_url(httpd, '/infinite/foo/'),
'time_limit': 20}]} 'time_limit': 20}]}
job = brozzler.new_job(frontier, job_conf) job = brozzler.new_job(frontier, job_conf)
assert job.id assert job.id
@@ -801,7 +828,7 @@ def test_ydl_stitching(httpd):
rr = doublethink.Rethinker('localhost', db='brozzler') rr = doublethink.Rethinker('localhost', db='brozzler')
frontier = brozzler.RethinkDbFrontier(rr) frontier = brozzler.RethinkDbFrontier(rr)
site = brozzler.Site(rr, { site = brozzler.Site(rr, {
'seed': 'http://localhost:%s/site10/' % httpd.server_port, 'seed': make_url(httpd, '/site10/'),
'warcprox_meta': { 'warcprox_meta': {
'warc-prefix': 'test_ydl_stitching', 'warc-prefix': 'test_ydl_stitching',
'captures-table-extra-fields': {'test_id':test_id}}}) 'captures-table-extra-fields': {'test_id':test_id}}})
@@ -819,7 +846,7 @@ def test_ydl_stitching(httpd):
assert len(pages) == 1 assert len(pages) == 1
page = pages[0] page = pages[0]
assert len(page.videos) == 6 assert len(page.videos) == 6
stitched_url = 'youtube-dl:00001:http://localhost:%s/site10/' % httpd.server_port stitched_url = 'youtube-dl:00001:%s' % make_url(httpd, '/site10/')
assert { assert {
'blame': 'youtube-dl', 'blame': 'youtube-dl',
'content-length': 267900, 'content-length': 267900,


@@ -24,27 +24,27 @@ the brozzler virtualenv.
:: ::
my-laptop$ vagrant ssh my-laptop$ vagrant ssh
vagrant@brzl:~$ source /opt/brozzler-ve34/bin/activate vagrant@brzl:~$ source /opt/brozzler-ve3/bin/activate
(brozzler-ve34)vagrant@brzl:~$ (brozzler-ve3)vagrant@brzl:~$
Then you can run brozzler-new-site: Then you can run brozzler-new-site:
:: ::
(brozzler-ve34)vagrant@brzl:~$ brozzler-new-site --proxy=localhost:8000 http://example.com/ (brozzler-ve3)vagrant@brzl:~$ brozzler-new-site --proxy=localhost:8000 http://example.com/
Or brozzler-new-job (make sure to set the proxy to localhost:8000): Or brozzler-new-job (make sure to set the proxy to localhost:8000):
:: ::
(brozzler-ve34)vagrant@brzl:~$ cat >job1.yml <<EOF (brozzler-ve3)vagrant@brzl:~$ cat >job1.yml <<EOF
id: job1 id: job1
proxy: localhost:8000 # point at warcprox for archiving proxy: localhost:8000 # point at warcprox for archiving
seeds: seeds:
- url: https://example.org/ - url: https://example.org/
EOF EOF
(brozzler-ve34)vagrant@brzl:~$ brozzler-new-job job1.yml (brozzler-ve3)vagrant@brzl:~$ brozzler-new-job job1.yml
WARC files will appear in ./warcs and brozzler, warcprox and rethinkdb logs in WARC files will appear in ./warcs and brozzler, warcprox and rethinkdb logs in
./logs (via vagrant folders syncing). ./logs (via vagrant folders syncing).
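The same kind of job can also be queued programmatically, which is how the tests in this commit do it. A minimal sketch along those lines, assuming RethinkDB is reachable on localhost and warcprox is proxying on localhost:8000 as in the Vagrant setup above::

    # minimal sketch mirroring the test setup in this commit; assumes
    # rethinkdb on localhost and warcprox listening on localhost:8000
    import brozzler
    import doublethink

    rr = doublethink.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(rr)
    job_conf = {
        'id': 'job1',
        'proxy': 'localhost:8000',  # point at warcprox for archiving
        'seeds': [{'url': 'https://example.org/'}],
    }
    job = brozzler.new_job(frontier, job_conf)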

vagrant/Vagrantfile

@@ -1,8 +1,9 @@
Vagrant.configure(2) do |config| Vagrant.configure(2) do |config|
config.vm.box = "ubuntu/trusty64" config.vm.box = "ubuntu/xenial64"
config.vm.define "10.9.9.9" config.vm.define "10.9.9.9"
config.vm.hostname = "brzl" config.vm.hostname = "brzl"
config.vm.network :private_network, ip: "10.9.9.9" config.vm.network :private_network, ip: "10.9.9.9"
config.disksize.size = '50GB'
config.vm.synced_folder "..", "/brozzler" config.vm.synced_folder "..", "/brozzler"
@@ -14,6 +15,7 @@ Vagrant.configure(2) do |config|
config.vm.provision "ansible" do |ansible| config.vm.provision "ansible" do |ansible|
ansible.inventory_path = "../ansible/hosts-vagrant" ansible.inventory_path = "../ansible/hosts-vagrant"
ansible.playbook = "../ansible/playbook.yml" ansible.playbook = "../ansible/playbook.yml"
# ansible.verbose = "-vvv"
end end
config.vm.provider 'virtualbox' do |v| config.vm.provider 'virtualbox' do |v|


@@ -10,12 +10,12 @@ cd $(dirname "${BASH_SOURCE[0]}")
vagrant up vagrant up
echo service status: echo service status:
vagrant ssh -- 'status warcprox ; vagrant ssh -- 'sudo svstat /etc/service/warcprox ;
status Xvnc ; sudo svstat /etc/service/Xvnc ;
status brozzler-worker ; sudo svstat /etc/service/brozzler-worker ;
status brozzler-dashboard ; sudo svstat /etc/service/brozzler-dashboard ;
status vnc-websock' sudo svstat /etc/service/vnc-websock'
echo echo
vagrant ssh -- 'set -x ; source /opt/brozzler-ve34/bin/activate && pip install pytest && pip install --upgrade --pre "warcprox>=2.1b1.dev86"' vagrant ssh -- 'set -x ; source /opt/brozzler-ve3/bin/activate && pip install pytest==4.3.0 && pip install --upgrade --pre "warcprox>=2.1b1.dev86"'
vagrant ssh -- "source /opt/brozzler-ve34/bin/activate && DISPLAY=:1 py.test -v /brozzler/tests $@" vagrant ssh -- "source /opt/brozzler-ve3/bin/activate && DISPLAY=:1 py.test --tb=native -v /brozzler/tests $@"


@@ -7,7 +7,7 @@ This is a standalone script with no dependencies other than python, and should
work with python 2.7 or python 3.2+. The only reason it's not a bash script is work with python 2.7 or python 3.2+. The only reason it's not a bash script is
so we can use the argparse library. so we can use the argparse library.
Copyright (C) 2016 Internet Archive Copyright (C) 2016-2019 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
@@ -41,9 +41,8 @@ def main(argv=[]):
subprocess.call([ subprocess.call([
'vagrant', 'ssh', '--', 'vagrant', 'ssh', '--',
'f=`mktemp` && cat > $f && ' 'f=`mktemp` && cat > $f && '
'PYTHONPATH=/home/vagrant/brozzler-ve34/lib/python3.4/site-packages ' '/home/vagrant/brozzler-ve3/bin/python '
'/home/vagrant/brozzler-ve34/bin/python ' '/home/vagrant/brozzler-ve3/bin/brozzler-new-job $f'],
'/home/vagrant/brozzler-ve34/bin/brozzler-new-job $f'],
stdin=f) stdin=f)
if __name__ == '__main__': if __name__ == '__main__':
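As the docstring above explains, these wrapper scripts are Python rather than bash only so they can use argparse. A hypothetical skeleton of that pattern, with illustrative option names rather than the script's actual interface; the inner vagrant ssh command is the one shown in the hunk above::

    # hypothetical argparse skeleton for a wrapper like the ones in this diff;
    # the option names are illustrative, not the script's real interface
    import argparse
    import subprocess
    import sys

    def main(argv):
        arg_parser = argparse.ArgumentParser(
                description='submit a brozzler job config inside the vagrant vm')
        arg_parser.add_argument(
                'job_conf_file', help='path to a brozzler job config yaml')
        args = arg_parser.parse_args(argv[1:])
        with open(args.job_conf_file, 'rb') as f:
            # pipe the job conf into brozzler-new-job inside the vm
            return subprocess.call([
                    'vagrant', 'ssh', '--',
                    'f=`mktemp` && cat > $f && '
                    '/home/vagrant/brozzler-ve3/bin/python '
                    '/home/vagrant/brozzler-ve3/bin/brozzler-new-job $f'],
                    stdin=f)

    if __name__ == '__main__':
        sys.exit(main(sys.argv))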


@@ -74,11 +74,8 @@ def main(argv=[]):
os.chdir(os.path.dirname(__file__)) os.chdir(os.path.dirname(__file__))
cmd = ( cmd = (
'PYTHONPATH=/home/vagrant/brozzler-ve34/lib/python3.4/site-packages ' '/opt/brozzler-ve3/bin/python /opt/brozzler-ve3/bin/brozzler-new-site '
'/home/vagrant/brozzler-ve34/bin/python ' '%s %s') % (' '.join(options), args.seed)
'/home/vagrant/brozzler-ve34/bin/brozzler-new-site '
'--proxy=localhost:8000 %s %s') % (
' '.join(options), args.seed)
subprocess.call(['vagrant', 'ssh', '--', cmd]) subprocess.call(['vagrant', 'ssh', '--', cmd])
if __name__ == '__main__': if __name__ == '__main__':