Merge branch 'typos' into qa

Barbara Miller 2019-05-17 17:24:19 -07:00
commit 4ada3e01b7
40 changed files with 469 additions and 320 deletions

View File

@ -11,19 +11,22 @@ before_install:
- sudo pip install ansible==2.1.3.0
install:
- ansible-playbook --extra-vars="brozzler_pip_name=file://$TRAVIS_BUILD_DIR#egg=brozzler user=travis" --inventory-file=ansible/hosts-localhost ansible/playbook.yml
- pip install $TRAVIS_BUILD_DIR git+https://github.com/internetarchive/warcprox.git#egg=warcprox pytest
- pip install $TRAVIS_BUILD_DIR git+https://github.com/internetarchive/warcprox.git#egg=warcprox pytest==4.3.0
- chromium-browser --version
- sudo apt-get update
- sudo apt-get install --only-upgrade chromium-browser
- chromium-browser --version
- sudo service brozzler-worker restart
- ps ww -fHe
- sudo cat /var/log/Xvnc.log
- sudo cat /var/log/brozzler-worker.log
- sudo cat /var/log/warcprox.log
script:
- DISPLAY=:1 py.test --tb=native -v tests
after_failure:
- chromium-browser --version
- sudo cat /var/log/upstart/warcprox.log
- sudo cat /var/log/upstart/brozzler-worker.log
- sudo cat /var/log/upstart/pywb.log
- sudo cat /var/log/warcprox.log
- sudo cat /var/log/brozzler-worker.log
- sudo cat /var/log/pywb.log
notifications:
slack:
secure: KPPXSscXnmSEQ2NXBZFKrzDEYHg067Kv1WR7RTRUH8EIlSS9MHTyErRa7HkaRPmqOllj4vvPbplNU2ALnCfhP4cqW+MvF0xv3GuEGXQ7Om2sBvVUQ3w0JJ5rLq9ferAfGdSnQFeViqfDix5LA3fMNZGouUHQdUHq7iO8E9n9jntvkKO9Jff7Dyo0K5KvOZOJfM9KsqFZLlFO5zoNB6Y9jubIT7+Ulk3EDto/Kny34VPIyJIm7y0cHHlYLEq780AweY0EIwMyMg/VPSRrVAsbLSrilO0YRgsQpjPC9Ci/rAWNWooaOk0eA+bwv1uHQnGtH0z446XUMXr3UZ2QlD4DE/uoP2okkl8EtqvlmEyjV8eO86TqYFDRgKfYpvlK6hHtb7SAHX28QeXQjbKNc5f7KpKO5PtZqaoBRL7acLlKyS8xQGiRtonTPFSBTFR2A+s6dZmKO9dDboglptiHk4dvL1ZD4S8qLJn1JjTJqvIU6tpCY3BpNErn4n1MkDjN5nqdXf7Q9Vmui8vRetwnMf1oXcsKj9FEt2utNfDqFNXcFsN+Mnr9rhXQ1++gt/7Zo844OowiARcxqZTNy5LqSD01WgGCvNMy3Odf+FTQ8PcDOF+001+g8La1R99U0o9/hT/gy+WYk2prYneWru4pQHF/a6goZgkLTwkskcaPVpDJtDs=

View File

@ -1,7 +1,9 @@
[all:vars]
warcs_dir=/vagrant/warcs
brozzler_pip_name='-e /brozzler'
# brozzler_pip_name='-e /brozzler' # not working anymore? :(
brozzler_pip_name='/brozzler'
user=vagrant
ansible_python_interpreter=/usr/bin/python3
### possible values for a prod deployment
# brozzler_pip_name=brozzler # get it from pypi
# brozzler_pip_name=git+https://github.com/internetarchive/brozzler.git#egg=brozzler

View File

@ -1,4 +1,8 @@
---
- name: restart brozzler-dashboard
service: name=brozzler-dashboard state=restarted
svc:
name: brozzler-dashboard
state: restarted
service_dir: /etc/service
become: true
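
The handler now goes through Ansible's svc module, which drives daemontools, instead of the generic service module used before. A hand-run equivalent for checking and bouncing the supervised service looks roughly like this (paths as in the playbook; the pid and uptime in the sample output are invented)::

    sudo svstat /etc/service/brozzler-dashboard
    # /etc/service/brozzler-dashboard: up (pid 1234) 56 seconds
    sudo svc -t /etc/service/brozzler-dashboard   # send TERM; supervise restarts it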

View File

@ -1,20 +1,33 @@
---
- name: mkdir {{venv_root}}/brozzler-dashboard-ve34
file: path={{venv_root}}/brozzler-dashboard-ve34 state=directory
- name: mkdir {{venv_root}}/brozzler-dashboard-ve3
file: path={{venv_root}}/brozzler-dashboard-ve3 state=directory
owner={{user}}
become: true
- name: install brozzler[dashboard] in virtualenv
pip: name='{{brozzler_pip_name}}[dashboard]'
virtualenv={{venv_root}}/brozzler-dashboard-ve34
virtualenv_python=python3.4
extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
pip:
name: '{{brozzler_pip_name}}[dashboard]'
virtualenv: '{{venv_root}}/brozzler-dashboard-ve3'
virtualenv_python: python3
virtualenv_command: python3 /usr/lib/python3/dist-packages/virtualenv.py
extra_args: '--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
become: true
become_user: '{{user}}'
notify:
- restart brozzler-dashboard
- name: install upstart config /etc/init/brozzler-dashboard.conf
- name: mkdir /etc/service/brozzler-dashboard
file:
path: /etc/service/brozzler-dashboard
state: directory
become: true
template: src=templates/brozzler-dashboard.conf.j2
dest=/etc/init/brozzler-dashboard.conf
- name: install /etc/service/brozzler-dashboard/run
template:
src: templates/brozzler-dashboard-run.j2
dest: /etc/service/brozzler-dashboard/run
mode: 0755
notify:
- restart brozzler-dashboard
become: true

View File

@ -0,0 +1,15 @@
#!/bin/bash
logfile=/var/log/brozzler-dashboard.log
touch $logfile
chown {{user}} $logfile
source /opt/brozzler-dashboard-ve3/bin/activate
exec nice setuidgid {{user}} \
env WAYBACK_BASEURL=http://{{groups['pywb'][0]}}:8880/brozzler \
RETHINKDB_SERVERS={{groups['rethinkdb'] | join(',')}} \
RETHINKDB_DB=brozzler LANG=en_US.UTF-8 LC_COLLATE=C \
gunicorn --bind=0.0.0.0:8881 brozzler.dashboard:app \
>> $logfile 2>&1
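
Because daemontools reruns the script above whenever it exits, a quick sanity check that the dashboard actually came up is to hit the gunicorn bind address from the script (port 8881); a hypothetical spot check from inside the VM::

    curl -sS -o /dev/null -w '%{http_code}\n' http://localhost:8881/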

View File

@ -1,18 +0,0 @@
description "brozzler-dashboard"
start on runlevel [2345]
stop on runlevel [!2345]
env PYTHONPATH={{venv_root}}/brozzler-dashboard-ve34/lib/python3.4/site-packages
env PATH={{venv_root}}/brozzler-dashboard-ve34/bin:/usr/bin:/bin
env LC_ALL=C.UTF-8
env WAYBACK_BASEURL=http://{{groups['pywb'][0]}}:8880/brozzler
env RETHINKDB_SERVERS={{groups['rethinkdb'] | join(',')}}
env RETHINKDB_DB=brozzler
setuid {{user}}
console log
exec gunicorn --bind=0.0.0.0:8881 brozzler.dashboard:app

View File

@ -1,13 +1,22 @@
---
- name: restart Xvnc
service: name=Xvnc state=restarted
become: true
- name: restart websockify
service: name=websockify state=restarted
svc:
name: Xvnc
state: restarted
service_dir: /etc/service
become: true
- name: restart vnc-websock
service: name=vnc-websock state=restarted
svc:
name: vnc-websock
state: restarted
service_dir: /etc/service
become: true
- name: restart brozzler-worker
service: name=brozzler-worker state=restarted
svc:
name: brozzler-worker
state: restarted
service_dir: /etc/service
become: true

View File

@ -3,14 +3,22 @@
apt_repository: repo='deb http://archive.canonical.com/ubuntu trusty partner'
state=present
become: true
- apt: update_cache=yes
become: true
- name: ensure required packages are installed
become: true
apt: name={{item}} state=present
with_items:
- vnc4server
- chromium-browser
- vnc4server
- libjpeg-turbo8-dev
- zlib1g-dev
- gcc
- python3-dev
- python3-dbg
- adobe-flashplugin
- xfonts-base
- fonts-arphic-bkai00mp
- fonts-arphic-bsmi00lp
@ -24,51 +32,74 @@
- fonts-sil-padauk
- fonts-unfonts-extra
- fonts-unfonts-core
- ttf-indic-fonts
- fonts-indic
- fonts-thai-tlwg
- fonts-lklug-sinhala
- git
- libjpeg-turbo8-dev
- zlib1g-dev
- gcc
- g++
- libpython3.4-dev
- adobe-flashplugin
- name: install Xvnc upstart config /etc/init/Xvnc.conf
template: src=templates/Xvnc.conf.j2 dest=/etc/init/Xvnc.conf
- name: mkdir /etc/service/{Xvnc,vnc-websock,brozzler-worker}
file:
path: '/etc/service/{{item}}'
state: directory
with_items:
- Xvnc
- vnc-websock
- brozzler-worker
become: true
- name: install /etc/service/Xvnc/run
template:
src: templates/Xvnc-run.j2
dest: /etc/service/Xvnc/run
mode: 0755
notify:
- restart Xvnc
- name: mkdir {{venv_root}}/websockify-ve34
become: true
file: path={{venv_root}}/websockify-ve34 state=directory owner={{user}}
- name: mkdir {{venv_root}}/websockify-ve3
become: true
file: path={{venv_root}}/websockify-ve3 state=directory owner={{user}}
- name: install websockify in virtualenv
pip: name=git+https://github.com/kanaka/websockify.git#egg=websockify
virtualenv={{venv_root}}/websockify-ve34
virtualenv_python=python3.4
extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
pip:
name: git+https://github.com/kanaka/websockify.git#egg=websockify
virtualenv: '{{venv_root}}/websockify-ve3'
virtualenv_python: python3
virtualenv_command: python3 /usr/lib/python3/dist-packages/virtualenv.py
extra_args: '--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
become: true
become_user: '{{user}}'
- name: install vnc-websock upstart config /etc/init/vnc-websock.conf
template: src=templates/vnc-websock.conf.j2 dest=/etc/init/vnc-websock.conf
become: true
- name: install /etc/service/vnc-websock/run
template:
src: templates/vnc-websock-run.j2
dest: /etc/service/vnc-websock/run
mode: 0755
notify:
- restart vnc-websock
- name: mkdir {{venv_root}}/brozzler-ve34
become: true
file: path={{venv_root}}/brozzler-ve34 state=directory owner={{user}}
- name: mkdir {{venv_root}}/brozzler-ve3
become: true
file: path={{venv_root}}/brozzler-ve3 state=directory owner={{user}}
- name: install brozzler in virtualenv
pip: # name=git+https://github.com/internetarchive/brozzler.git#egg=brozzler
name='{{brozzler_pip_name}}'
virtualenv={{venv_root}}/brozzler-ve34
virtualenv_python=python3.4
extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
pip:
name: '{{brozzler_pip_name}}'
virtualenv: '{{venv_root}}/brozzler-ve3'
virtualenv_python: python3
virtualenv_command: python3 /usr/lib/python3/dist-packages/virtualenv.py
extra_args: '--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
become: true
become_user: '{{user}}'
notify:
- restart brozzler-worker
- name: install brozzler-worker upstart config /etc/init/brozzler-worker.conf
template: src=templates/brozzler-worker.conf.j2 dest=/etc/init/brozzler-worker.conf
become: true
- name: install /etc/service/brozzler-worker/run
template:
src: templates/brozzler-worker-run.j2
dest: /etc/service/brozzler-worker/run
mode: 0755
notify:
- restart brozzler-worker
become: true
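
For reference, the pip tasks above boil down to something like the following shell steps on the target host, assuming venv_root=/opt (as in the run templates) and the vagrant inventory values brozzler_pip_name=/brozzler and user=vagrant::

    sudo mkdir -p /opt/brozzler-ve3 && sudo chown vagrant /opt/brozzler-ve3
    sudo -u vagrant python3 /usr/lib/python3/dist-packages/virtualenv.py \
        --python=python3 /opt/brozzler-ve3
    sudo -u vagrant /opt/brozzler-ve3/bin/pip install --no-input --upgrade --pre \
        --cache-dir=/tmp/pip-cache /brozzler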

View File

@ -0,0 +1,14 @@
#!/bin/bash
cd /tmp
logfile=/var/log/Xvnc.log
touch $logfile
chown {{user}} $logfile
exec nice setuidgid {{user}} Xvnc4 :1 -auth /tmp/Xauthority.{{user}} \
-geometry 1600x1000 -depth 24 -rfbwait 0 -nolisten tcp -rfbport 5901 \
-SecurityTypes None -pn -fp /usr/share/fonts/X11/misc/ -co /etc/X11/rgb \
AcceptCutText=0 AcceptPointerEvents=0 AcceptKeyEvents=0 \
>> $logfile 2>&1

View File

@ -1,14 +0,0 @@
description "Xvnc"
start on runlevel [2345]
stop on runlevel [!2345]
setuid {{user}}
console log
exec nice Xvnc4 :1 -auth /tmp/Xauthority.{{user}} \
-geometry 1600x1000 -depth 24 -rfbwait 0 -nolisten tcp -rfbport 5901 \
-SecurityTypes None -pn -fp /usr/share/fonts/X11/misc/ -co /etc/X11/rgb \
AcceptCutText=0 AcceptPointerEvents=0 AcceptKeyEvents=0

View File

@ -0,0 +1,17 @@
#!/bin/bash
logfile=/var/log/brozzler-worker.log
touch $logfile
chown {{user}} $logfile
source {{venv_root}}/brozzler-ve3/bin/activate
exec nice setuidgid {{user}} \
env DISPLAY=:1 LANG=en_US.UTF-8 LC_COLLATE=C \
brozzler-worker \
--rethinkdb-servers={{groups['rethinkdb'] | join(',')}} \
--max-browsers=4 \
--trace \
--warcprox-auto \
>> $logfile 2>&1

View File

@ -1,25 +0,0 @@
description "brozzler-worker"
start on runlevel [2345]
stop on runlevel [!2345]
env DISPLAY=:1
env PATH={{venv_root}}/brozzler-ve34/bin:/usr/bin:/bin
env PYTHONPATH={{venv_root}}/brozzler-ve34/lib/python3.4/site-packages
env LANG=C.UTF-8
setuid {{user}}
console log
# depends on vnc server
start on started Xvnc
stop on stopping Xvnc
kill timeout 60
exec nice brozzler-worker \
--rethinkdb-servers={{groups['rethinkdb'] | join(',')}} \
--max-browsers=4 \
--verbose \
--warcprox-auto

View File

@ -0,0 +1,10 @@
#!/bin/bash
logfile=/var/log/vnc-websock.log
touch $logfile
chown {{user}} $logfile
source /opt/websockify-ve3/bin/activate
exec nice setuidgid {{user}} websockify 0.0.0.0:8901 localhost:5901 >> $logfile 2>&1

View File

@ -1,15 +0,0 @@
description "vnc-websock"
start on runlevel [2345]
stop on runlevel [!2345]
setuid {{user}}
console log
env PYTHONPATH={{venv_root}}/websockify-ve34/lib/python3.4/site-packages
env PATH={{venv_root}}/websockify-ve34/bin:/usr/bin:/bin
# port 8901 is hard-coded in brozzler/dashboard/static/partials/workers.html
exec nice websockify 0.0.0.0:8901 localhost:5901

View File

@ -1,44 +1,74 @@
---
# get latest pip (had problems with version from apt-get, specifically
# "pip install pyopenssl" did not install the dependency "cryptography")
# http://stackoverflow.com/questions/34587473/what-is-get-pip-py-checksum-where-can-i-get-it-for-sure
- name: install setuptools for python 2 and 3
- apt:
name:
- python3-setuptools
- python3-pip
- python3-virtualenv
- daemontools
- daemontools-run
state: present
update_cache: yes
cache_valid_time: 86400 # one day
become: true
apt: name={{item}} state=present
with_items:
- python-setuptools
- python3-setuptools
- name: download pip-9.0.1.tar.gz
get_url:
url: https://pypi.python.org/packages/11/b6/abcb525026a4be042b486df43905d6893fb04f05aac21c32c638e939e447/pip-9.0.1.tar.gz
dest: /tmp
checksum: sha1:57ff41e99cb01b6a1c2b0999161589b726f0ec8b
- name: extract pip-9.0.1.tar.gz
unarchive: src=/tmp/pip-9.0.1.tar.gz dest=/tmp copy=no
# # get recent virtualenv, which bundles a recent pip
# - find:
# paths:
# - /usr/local/lib/python3.4/dist-packages
# - /usr/local/lib/python3.5/dist-packages
# recurse: true
# patterns: virtualenv.py
# contains: '__version__ = "16.4.3"'
# register: virtualenv_py_16_4_3
#
# - command: mktemp -d
# register: mktempd_out
# when: virtualenv_py_16_4_3.matched == 0
#
# - name: download virtualenv-16.4.3
# get_url:
# url: https://files.pythonhosted.org/packages/37/db/89d6b043b22052109da35416abc3c397655e4bd3cff031446ba02b9654fa/virtualenv-16.4.3.tar.gz
# dest: '{{mktempd_out.stdout}}'
# checksum: sha256:984d7e607b0a5d1329425dd8845bd971b957424b5ba664729fab51ab8c11bc39
# when: virtualenv_py_16_4_3.matched == 0
#
# - name: extract virtualenv-16.4.3.tar.gz
# unarchive:
# src: '{{mktempd_out.stdout}}/virtualenv-16.4.3.tar.gz'
# dest: '{{mktempd_out.stdout}}'
# copy: no
# when: virtualenv_py_16_4_3.matched == 0
#
# - name: run "python3 setup.py install" in {{mktempd_out.stdout}}/virtualenv-16.4.3
# become: true
# command: python3 setup.py install
# args:
# chdir: '{{mktempd_out.stdout}}/virtualenv-16.4.3'
# when: virtualenv_py_16_4_3.matched == 0
#
# - file:
# path: '{{mktempd_out.stdout}}'
# state: absent
# become: true
# when: virtualenv_py_16_4_3.matched == 0
# this clause is a workaround for travis-ci, which only wants to install in /usr
# see https://travis-ci.org/internetarchive/brozzler/builds/174338601
# but it complains that /usr/lib/python3.4/site-packages doesn't exist
# but it complains that /usr/lib/python3.5/site-packages doesn't exist
# see https://travis-ci.org/internetarchive/brozzler/builds/174094831
- file: path={{item}} state=directory
- file:
path: '{{item}}'
state: directory
with_items:
- /usr/lib/python3.4/site-packages
- /usr/lib/python3.4/dist-packages
- /usr/lib/python3.5/site-packages
- /usr/lib/python3.5/dist-packages
become: true
- name: run "python3 setup.py install" in /tmp/pip-9.0.1
command: python3 setup.py install
chdir=/tmp/pip-9.0.1
creates=/usr/local/lib/python3.4/dist-packages/pip-9.0.1-py3.4.egg/pip/__init__.py
become: true
- name: run "pip install virtualenv"
command: pip install virtualenv
creates=/usr/local/lib/python3.4/dist-packages/virtualenv.py
become: true
- command: id {{user}}
register: id_user
ignore_errors: true
changed_when: false
- name: ensure service user {{user}} exists
user: name={{user}} system=yes createhome=no home=/nonexistent
shell=/usr/sbin/nologin

View File

@ -1,5 +1,9 @@
---
- name: restart pywb
service: name=pywb state=restarted
svc:
name: pywb
state: restarted
service_dir: /etc/service
become: true

View File

@ -1,36 +1,52 @@
---
- name: mkdir {{venv_root}}/pywb-ve34
file: path={{venv_root}}/pywb-ve34 state=directory
- name: mkdir {{venv_root}}/pywb-ve3
file: path={{venv_root}}/pywb-ve3 state=directory
owner={{user}}
become: true
- name: install pywb in virtualenv
pip: name=pywb
version=0.33.2
virtualenv={{venv_root}}/pywb-ve34
virtualenv_python=python3.4
extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
pip:
name: pywb
version: 0.33.2
virtualenv: '{{venv_root}}/pywb-ve3'
virtualenv_python: python3
virtualenv_command: python3 /usr/lib/python3/dist-packages/virtualenv.py
extra_args: '--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
become: true
become_user: '{{user}}'
notify:
- restart pywb
- name: install brozzler in pywb virtualenv
pip: name='{{brozzler_pip_name}}'
virtualenv={{venv_root}}/pywb-ve34
virtualenv_python=python3.4
extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
pip:
name: '{{brozzler_pip_name}}'
virtualenv: '{{venv_root}}/pywb-ve3'
virtualenv_python: python3
virtualenv_command: python3 /usr/lib/python3/dist-packages/virtualenv.py
extra_args: '--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
become: true
become_user: '{{user}}'
notify:
- restart pywb
- name: pywb config file /etc/pywb.yml
template: src=templates/pywb.yml.j2
dest=/etc/pywb.yml
become: true
notify:
- restart pywb
- name: upstart config file /etc/init/pywb.conf
template: src=templates/pywb.conf.j2
dest=/etc/init/pywb.conf
- name: mkdir /etc/service/pywb
file:
path: /etc/service/pywb
state: directory
become: true
- name: install /etc/service/pywb/run
template:
src: templates/pywb-run.j2
dest: /etc/service/pywb/run
mode: 0755
notify:
- restart pywb
become: true

View File

@ -0,0 +1,10 @@
#!/bin/bash
logfile=/var/log/pywb.log
touch $logfile
chown {{user}} $logfile
exec nice setuidgid {{user}} env PYWB_CONFIG_FILE=/etc/pywb.yml \
{{venv_root}}/pywb-ve3/bin/python {{venv_root}}/pywb-ve3/bin/brozzler-wayback \
>> $logfile 2>&1

View File

@ -1,14 +0,0 @@
description "pywb"
start on runlevel [2345]
stop on runlevel [!2345]
env PYTHONPATH={{venv_root}}/pywb-ve34/lib/python3.4/site-packages
env PATH={{venv_root}}/pywb-ve34/bin:/usr/bin:/bin
env PYWB_CONFIG_FILE=/etc/pywb.yml
setuid {{user}}
console log
exec nice brozzler-wayback

View File

@ -3,8 +3,9 @@
apt_key: url=http://download.rethinkdb.com/apt/pubkey.gpg
become: true
- name: ensure rethinkdb repo is in apt sources.list
apt_repository: repo='deb http://download.rethinkdb.com/apt trusty main'
state=present
apt_repository:
repo: 'deb http://download.rethinkdb.com/apt {{ansible_lsb.codename|lower}} main'
state: present
become: true
- apt: update_cache=yes
become: true
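
With the Vagrant box switched to ubuntu/xenial64 later in this commit, {{ansible_lsb.codename|lower}} renders as xenial, so the entry apt_repository writes should come out roughly as below (the sources.list.d path is a guess)::

    # e.g. in /etc/apt/sources.list.d/
    deb http://download.rethinkdb.com/apt xenial main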

View File

@ -1,4 +1,7 @@
---
- name: restart warcprox
service: name=warcprox state=restarted
svc:
name: warcprox
state: restarted
service_dir: /etc/service
become: true

View File

@ -4,26 +4,37 @@
apt: name={{item}} state=present
with_items:
- gcc
- python3.4
- libpython3.4-dev
- python3-dev
- libffi-dev
- libssl-dev
- tor
- git
- name: mkdir {{venv_root}}/warcprox-ve34
- name: mkdir {{venv_root}}/warcprox-ve3
become: true
file: path={{venv_root}}/warcprox-ve34 state=directory owner={{user}}
file: path={{venv_root}}/warcprox-ve3 state=directory owner={{user}}
- name: install warcprox in virtualenv
pip: name=git+https://github.com/internetarchive/warcprox.git#egg=warcprox
virtualenv={{venv_root}}/warcprox-ve34
virtualenv_python=python3.4
extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
pip:
name: git+https://github.com/internetarchive/warcprox.git#egg=warcprox
virtualenv: '{{venv_root}}/warcprox-ve3'
virtualenv_python: python3
extra_args: --no-input --upgrade --pre --cache-dir=/tmp/pip-cache
virtualenv_command: python3 /usr/lib/python3/dist-packages/virtualenv.py
become: true
become_user: '{{user}}'
notify:
- restart warcprox
- name: install upstart config /etc/init/warcprox.conf
- name: mkdir /etc/service/warcprox
file:
path: /etc/service/warcprox
state: directory
become: true
template: src=templates/warcprox.conf.j2 dest=/etc/init/warcprox.conf
- name: install /etc/service/warcprox/run
template:
src: templates/run.j2
dest: /etc/service/warcprox/run
mode: 0755
notify:
- restart warcprox
become: true

View File

@ -1,19 +1,16 @@
description "warcprox"
#!/bin/bash
start on runlevel [2345]
stop on runlevel [!2345]
logfile=/var/log/warcprox.log
touch $logfile
chown {{user}} $logfile
env PYTHONPATH={{venv_root}}/warcprox-ve34/lib/python3.4/site-packages
env PATH={{venv_root}}/warcprox-ve34/bin:/usr/bin:/bin
ulimit -n 4096
# by default warcprox creates some files/dirs relative to cwd
chdir {{work_dir}}
setuid {{user}}
cd {{work_dir}}
console log
source {{venv_root}}/warcprox-ve3/bin/activate
# --profile
exec nice warcprox \
exec nice -n5 setuidgid {{user}} env LANG=en_US.UTF-8 LC_COLLATE=C warcprox \
--address=0.0.0.0 \
--dir={{warcs_dir}} \
--base32 \
@ -22,4 +19,6 @@ exec nice warcprox \
--onion-tor-socks-proxy=localhost:9050 \
--rethinkdb-services-url=rethinkdb://{{groups['rethinkdb']|join(',')}}/brozzler/services \
--rethinkdb-stats-url=rethinkdb://{{groups['rethinkdb']|join(',')}}/brozzler/stats \
--rethinkdb-big-table-url=rethinkdb://{{groups['rethinkdb']|join(',')}}/brozzler/captures
--rethinkdb-big-table-url=rethinkdb://{{groups['rethinkdb']|join(',')}}/brozzler/captures \
>> $logfile 2>&1

View File

@ -162,7 +162,7 @@ class ThreadExceptionGate:
def queue_exception(self, e):
with self.lock:
if self.pending_exception:
self.logger.warn(
self.logger.warning(
'%r already pending for thread %r, discarding %r',
self.pending_exception, self.thread, e)
else:

View File

@ -223,7 +223,7 @@ class Chrome:
raise
except Exception as e:
if time.time() - self._last_warning > 30:
self.logger.warn(
self.logger.warning(
'problem with %s (will keep trying until timeout '
'of %d seconds): %s', json_url, timeout_sec, e)
self._last_warning = time.time()
@ -294,7 +294,7 @@ class Chrome:
'chrome pid %s exited normally',
self.chrome_process.pid)
else:
self.logger.warn(
self.logger.warning(
'chrome pid %s exited with nonzero status %s',
self.chrome_process.pid, status)
@ -305,13 +305,13 @@ class Chrome:
return
time.sleep(0.5)
self.logger.warn(
self.logger.warning(
'chrome pid %s still alive %.1f seconds after sending '
'SIGTERM, sending SIGKILL', self.chrome_process.pid,
time.time() - t0)
os.killpg(self.chrome_process.pid, signal.SIGKILL)
status = self.chrome_process.wait()
self.logger.warn(
self.logger.warning(
'chrome pid %s reaped (status=%s) after killing with '
'SIGKILL', self.chrome_process.pid, status)

View File

@ -2,7 +2,7 @@
'''
brozzler/cli.py - brozzler command line executables
Copyright (C) 2014-2017 Internet Archive
Copyright (C) 2014-2019 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@ -606,6 +606,10 @@ def brozzler_purge(argv=None):
'--site', dest='site', metavar='SITE_ID', help=(
'purge crawl state from rethinkdb for a site, including all '
'pages'))
group.add_argument(
'--finished-before', dest='finished_before', metavar='YYYY-MM-DD',
help=('purge crawl state from rethinkdb for jobs that ended '
'before this date'))
arg_parser.add_argument(
'--force', dest='force', action='store_true', help=(
'purge even if the job or site still has status ACTIVE'))
@ -628,7 +632,7 @@ def brozzler_purge(argv=None):
sys.exit(1)
if job.status == 'ACTIVE':
if args.force:
logging.warn(
logging.warning(
'job %s has status ACTIVE, purging anyway because '
'--force was supplied', job_id)
else:
@ -645,7 +649,7 @@ def brozzler_purge(argv=None):
sys.exit(1)
if site.status == 'ACTIVE':
if args.force:
logging.warn(
logging.warning(
'site %s has status ACTIVE, purging anyway because '
'--force was supplied', site_id)
else:
@ -654,6 +658,20 @@ def brozzler_purge(argv=None):
'(override with --force)', site_id)
sys.exit(1)
_purge_site(rr, site_id)
elif args.finished_before:
finished_before = datetime.datetime.strptime(
args.finished_before, '%Y-%m-%d').replace(
tzinfo=doublethink.UTC)
reql = rr.table('jobs').filter(
r.row['finished'].default(r.maxval).lt(finished_before).or_(
r.row['starts_and_stops'].nth(-1)['stop'].default(r.maxval).lt(finished_before)))
logging.debug(
'retrieving jobs older than %s: %s', finished_before, reql)
for job in reql.run():
# logging.info('job %s finished=%s starts_and_stops[-1]["stop"]=%s',
# job['id'], job.get('finished'),
# job.get('starts_and_stops', [{'stop':None}])[-1]['stop'])
_purge_job(rr, job['id'])
def _purge_site(rr, site_id):
reql = rr.table('pages').between(
@ -713,7 +731,7 @@ def brozzler_list_captures(argv=None):
if args.url_or_sha1[:5] == 'sha1:':
if args.prefix:
logging.warn(
logging.warning(
'ignoring supplied --prefix option which does not apply '
'to lookup by sha1')
# assumes it's already base32 (XXX could detect if hex and convert)
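
The new --finished-before option added above can be exercised like this (a hypothetical invocation with the rethinkdb connection options left at their defaults; the YYYY-MM-DD format comes from the argument's metavar)::

    brozzler-purge --finished-before 2019-01-01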

View File

@ -260,7 +260,7 @@ class BrozzlerEasyController:
state_strs.append(str(th))
stack = traceback.format_stack(sys._current_frames()[th.ident])
state_strs.append(''.join(stack))
logging.warn('dumping state (caught signal {})\n{}'.format(
logging.warning('dumping state (caught signal {})\n{}'.format(
signum, '\n'.join(state_strs)))
def main(argv=None):

View File

@ -138,7 +138,7 @@ class RethinkDbFrontier:
sites = []
for i in range(result["replaced"]):
if result["changes"][i]["old_val"]["claimed"]:
self.logger.warn(
self.logger.warning(
"re-claimed site that was still marked 'claimed' "
"because it was last claimed a long time ago "
"at %s, and presumably some error stopped it from "
@ -225,7 +225,7 @@ class RethinkDbFrontier:
if not job:
return False
if job.status.startswith("FINISH"):
self.logger.warn("%s is already %s", job, job.status)
self.logger.warning("%s is already %s", job, job.status)
return True
results = self.rr.table("sites").get_all(job_id, index="job_id").run()
@ -415,7 +415,7 @@ class RethinkDbFrontier:
assert isinstance(e, brozzler.ReachedLimit)
if (site.reached_limit
and site.reached_limit != e.warcprox_meta["reached-limit"]):
self.logger.warn(
self.logger.warning(
"reached limit %s but site had already reached limit %s",
e.warcprox_meta["reached-limit"], self.reached_limit)
else:
@ -434,7 +434,7 @@ class RethinkDbFrontier:
index="priority_by_site").filter({"hops_from_seed":0}).run()
pages = list(results)
if len(pages) > 1:
self.logger.warn(
self.logger.warning(
"more than one seed page for site_id %s ?", site_id)
if len(pages) < 1:
return None

View File

@ -106,7 +106,7 @@ def is_permitted_by_robots(site, url, proxy=None):
# reppy has wrapped an exception that we want to bubble up
raise brozzler.ProxyError(e)
else:
logging.warn(
logging.warning(
"returning true (permitted) after problem fetching "
"robots.txt for %r: %r", url, e)
return True

View File

@ -147,13 +147,13 @@ class BrozzlerWorker:
try:
with urllib.request.urlopen(request, timeout=600) as response:
if response.getcode() != 204:
self.logger.warn(
self.logger.warning(
'got "%s %s" response on warcprox '
'WARCPROX_WRITE_RECORD request (expected 204)',
response.getcode(), response.reason)
return request, response
except urllib.error.HTTPError as e:
self.logger.warn(
self.logger.warning(
'got "%s %s" response on warcprox '
'WARCPROX_WRITE_RECORD request (expected 204)',
e.getcode(), e.info())
@ -370,7 +370,7 @@ class BrozzlerWorker:
if (page.needs_robots_check and
not brozzler.is_permitted_by_robots(
site, page.url, self._proxy_for(site))):
logging.warn("page %s is blocked by robots.txt", page.url)
logging.warning("page %s is blocked by robots.txt", page.url)
page.blocked_by_robots = True
self._frontier.completed_page(site, page)
else:
@ -544,7 +544,7 @@ class BrozzlerWorker:
def start(self):
with self._start_stop_lock:
if self._thread:
self.logger.warn(
self.logger.warning(
'ignoring start request because self._thread is '
'not None')
return

View File

@ -48,7 +48,7 @@ _orig_webpage_read_content = youtube_dl.extractor.generic.GenericIE._webpage_rea
def _webpage_read_content(self, *args, **kwargs):
content = _orig_webpage_read_content(self, *args, **kwargs)
if len(content) > 20000000:
logging.warn(
logging.warning(
'bypassing youtube-dl extraction because content is '
'too large (%s characters)', len(content))
return ''
@ -185,7 +185,7 @@ def _build_youtube_dl(worker, destdir, site):
mimetype = magic.from_file(ctx['filename'], mime=True)
except ImportError as e:
mimetype = 'video/%s' % info_dict['ext']
self.logger.warn(
self.logger.warning(
'guessing mimetype %s because %r', mimetype, e)
url = 'youtube-dl:%05d:%s' % (

View File

@ -339,12 +339,12 @@ Brozzler derives its general approach to the seed surt from `heritrix
slash.
2. Canonicalization does not attempt to match heritrix exactly, though it
usually does match.
3. When generating a SURT for an HTTPS URL, heritrix changes the scheme to
HTTP. For example, the heritrix SURT for ``https://www.example.com/`` is
``http://(com,example,www,)`` and this means that all of
``http://www.example.com/*`` and ``https://www.example.com/*`` are in
scope. It also means that a manually specified SURT with scheme "https" does
not match anything. Brozzler does no scheme munging.
3. Brozzler does no scheme munging. (When generating a SURT for an HTTPS URL,
heritrix changes the scheme to HTTP. For example, the heritrix SURT for
``https://www.example.com/`` is ``http://(com,example,www,)`` and this means
that all of ``http://www.example.com/*`` and ``https://www.example.com/*``
are in scope. It also means that a manually specified SURT with scheme
"https" does not match anything.)
4. Brozzler identifies seed "redirects" by retrieving the URL from the
browser's location bar at the end of brozzling the seed page, whereas
heritrix follows HTTP 3XX redirects. If the URL in the browser

View File

@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup(
name='brozzler',
version='1.5.4',
version='1.5.6',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',
@ -64,10 +64,10 @@ setuptools.setup(
],
},
install_requires=[
'PyYAML>=3.12',
'PyYAML>=5.1',
'youtube-dl>=2018.7.21',
'reppy==0.3.4',
'requests>=2.18.4',
'requests>=2.21',
'websocket-client>=0.39.0,<=0.48.0',
'pillow>=5.2.0',
'urlcanon>=0.1.dev23',
@ -80,13 +80,13 @@ setuptools.setup(
],
extras_require={
'dashboard': [
'flask>=0.11',
'flask>=1.0',
'gunicorn>=19.8.1'
],
'easy': [
'warcprox>=2.4b2.dev173',
'pywb>=0.33.2,<2',
'flask>=0.11',
'flask>=1.0',
'gunicorn>=19.8.1'
],
},

View File

@ -67,8 +67,8 @@ def httpd(request):
self.send_header('WWW-Authenticate', 'Basic realm=\"Test\"')
self.send_header('Content-type', 'text/html')
self.end_headers()
self.wfile.write(self.headers.getheader('Authorization'))
self.wfile.write('not authenticated')
self.wfile.write(self.headers.get('Authorization', b''))
self.wfile.write(b'not authenticated')
else:
super().do_GET()

View File

@ -34,16 +34,41 @@ import http.server
import logging
import warcprox
# https://stackoverflow.com/questions/166506/finding-local-ip-addresses-using-pythons-stdlib
def _local_address():
import socket
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
try:
s.connect(('10.255.255.255', 1)) # ip doesn't need to be reachable
return s.getsockname()[0]
except:
return '127.0.0.1'
finally:
s.close()
local_address = _local_address()
def start_service(service):
subprocess.check_call(['sudo', 'service', service, 'start'])
subprocess.check_call(['sudo', 'svc', '-u', '/etc/service/' + service])
def stop_service(service):
subprocess.check_call(['sudo', 'service', service, 'stop'])
subprocess.check_call(['sudo', 'svc', '-d', '/etc/service/' + service])
while True:
status = subprocess.check_output(
['sudo', 'svstat', '/etc/service/' + service])
if b' down ' in status:
break
time.sleep(0.5)
@pytest.fixture(scope='module')
def httpd(request):
class RequestHandler(http.server.SimpleHTTPRequestHandler):
def do_POST(self):
logging.info('\n%s\n%s', self.requestline, self.headers)
self.do_GET()
def do_GET(self):
logging.info('\n%s\n%s', self.requestline, self.headers)
if self.path == '/site5/redirect/':
self.send_response(303, 'See other')
self.send_header('Connection', 'close')
@ -82,7 +107,7 @@ def httpd(request):
# SimpleHTTPRequestHandler always uses CWD so we have to chdir
os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs'))
httpd = http.server.HTTPServer(('localhost', 0), RequestHandler)
httpd = http.server.HTTPServer((local_address, 0), RequestHandler)
httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
httpd_thread.start()
@ -94,6 +119,9 @@ def httpd(request):
return httpd
def make_url(httpd, rel_url):
return 'http://%s:%s%s' % (local_address, httpd.server_port, rel_url)
def test_httpd(httpd):
'''
Tests that our http server is working as expected, and that two fetches
@ -101,7 +129,7 @@ def test_httpd(httpd):
deduplication.
'''
payload1 = content2 = None
url = 'http://localhost:%s/site1/file1.txt' % httpd.server_port
url = make_url(httpd, '/site1/file1.txt')
with urllib.request.urlopen(url) as response:
assert response.status == 200
payload1 = response.read()
@ -140,13 +168,13 @@ def test_brozzle_site(httpd):
test_id = 'test_brozzle_site-%s' % datetime.datetime.utcnow().isoformat()
rr = doublethink.Rethinker('localhost', db='brozzler')
site = brozzler.Site(rr, {
'seed': 'http://localhost:%s/site1/' % httpd.server_port,
'seed': make_url(httpd, '/site1/'),
'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
# the two pages we expect to be crawled
page1 = 'http://localhost:%s/site1/' % httpd.server_port
page2 = 'http://localhost:%s/site1/file1.txt' % httpd.server_port
robots = 'http://localhost:%s/robots.txt' % httpd.server_port
page1 = make_url(httpd, '/site1/')
page2 = make_url(httpd, '/site1/file1.txt')
robots = make_url(httpd, '/robots.txt')
# so we can examine rethinkdb before it does anything
try:
@ -171,8 +199,7 @@ def test_brozzle_site(httpd):
pages = list(frontier.site_pages(site.id))
assert len(pages) == 2
assert {page.url for page in pages} == {
'http://localhost:%s/site1/' % httpd.server_port,
'http://localhost:%s/site1/file1.txt' % httpd.server_port}
make_url(httpd, '/site1/'), make_url(httpd, '/site1/file1.txt')}
time.sleep(2) # in case warcprox hasn't finished processing urls
# take a look at the captures table
@ -255,8 +282,8 @@ def test_proxy_non_warcprox(httpd):
start_service('brozzler-worker')
assert len(proxy.requests) <= 15
assert proxy.requests.count('GET /status') == 1
assert ('GET http://localhost:%s/site1/' % httpd.server_port) in proxy.requests
assert ('GET http://localhost:%s/site1/file1.txt' % httpd.server_port) in proxy.requests
assert ('GET %s' % make_url(httpd, '/site1/')) in proxy.requests
assert ('GET %s' % make_url(httpd, '/site1/file1.txt')) in proxy.requests
assert [req for req in proxy.requests if req.startswith('WARCPROX_WRITE_RECORD')] == []
proxy.shutdown()
@ -292,14 +319,14 @@ def _test_proxy_setting(
datetime.datetime.utcnow().isoformat())
# the two pages we expect to be crawled
page1 = 'http://localhost:%s/site1/' % httpd.server_port
page2 = 'http://localhost:%s/site1/file1.txt' % httpd.server_port
robots = 'http://localhost:%s/robots.txt' % httpd.server_port
page1 = make_url(httpd, '/site1/')
page2 = make_url(httpd, '/site1/file1.txt')
robots = make_url(httpd, '/robots.txt')
rr = doublethink.Rethinker('localhost', db='brozzler')
service_registry = doublethink.ServiceRegistry(rr)
site = brozzler.Site(rr, {
'seed': 'http://localhost:%s/site1/' % httpd.server_port,
'seed': make_url(httpd, '/site1/'),
'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
assert site.id is None
frontier = brozzler.RethinkDbFrontier(rr)
@ -332,8 +359,8 @@ def _test_proxy_setting(
pages = list(frontier.site_pages(site.id))
assert len(pages) == 2
assert {page.url for page in pages} == {
'http://localhost:%s/site1/' % httpd.server_port,
'http://localhost:%s/site1/file1.txt' % httpd.server_port}
make_url(httpd, '/site1/'),
make_url(httpd, '/site1/file1.txt')}
time.sleep(2) # in case warcprox hasn't finished processing urls
# take a look at the captures table
@ -360,7 +387,7 @@ def test_obey_robots(httpd):
test_id = 'test_obey_robots-%s' % datetime.datetime.utcnow().isoformat()
rr = doublethink.Rethinker('localhost', db='brozzler')
site = brozzler.Site(rr, {
'seed': 'http://localhost:%s/site1/' % httpd.server_port,
'seed': make_url(httpd, '/site1/'),
'user_agent': 'im a badbot', # robots.txt blocks badbot
'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
@ -390,12 +417,12 @@ def test_obey_robots(httpd):
pages = list(frontier.site_pages(site.id))
assert len(pages) == 1
page = pages[0]
assert page.url == 'http://localhost:%s/site1/' % httpd.server_port
assert page.url == make_url(httpd, '/site1/')
assert page.blocked_by_robots
# take a look at the captures table
time.sleep(2) # in case warcprox hasn't finished processing urls
robots_url = 'http://localhost:%s/robots.txt' % httpd.server_port
robots_url = make_url(httpd, '/robots.txt')
captures = list(rr.table('captures').filter({'test_id':test_id}).run())
assert len(captures) == 1
assert captures[0]['url'] == robots_url
@ -412,7 +439,7 @@ def test_login(httpd):
test_id = 'test_login-%s' % datetime.datetime.utcnow().isoformat()
rr = doublethink.Rethinker('localhost', db='brozzler')
site = brozzler.Site(rr, {
'seed': 'http://localhost:%s/site2/' % httpd.server_port,
'seed': make_url(httpd, '/site2/'),
'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}},
'username': 'test_username', 'password': 'test_password'})
@ -428,7 +455,7 @@ def test_login(httpd):
# take a look at the captures table
time.sleep(2) # in case warcprox hasn't finished processing urls
robots_url = 'http://localhost:%s/robots.txt' % httpd.server_port
robots_url = make_url(httpd, '/robots.txt')
captures = list(rr.table('captures').filter(
{'test_id':test_id}).order_by('timestamp').run())
meth_url = ['%s %s' % (c['http_method'], c['url']) for c in captures]
@ -436,25 +463,25 @@ def test_login(httpd):
# there are several forms in htdocs/site2/login.html but only one
# that brozzler's heuristic should match and try to submit, and it has
# action='00', so we can check for that here
assert ('POST http://localhost:%s/site2/00' % httpd.server_port) in meth_url
assert ('POST %s' % make_url(httpd, '/site2/00')) in meth_url
# sanity check the rest of the crawl
assert ('GET http://localhost:%s/robots.txt' % httpd.server_port) in meth_url
assert ('GET http://localhost:%s/site2/' % httpd.server_port) in meth_url
assert ('WARCPROX_WRITE_RECORD screenshot:http://localhost:%s/site2/' % httpd.server_port) in meth_url
assert ('WARCPROX_WRITE_RECORD thumbnail:http://localhost:%s/site2/' % httpd.server_port) in meth_url
assert ('GET http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url
assert ('WARCPROX_WRITE_RECORD screenshot:http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url
assert ('WARCPROX_WRITE_RECORD thumbnail:http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url
assert ('GET %s' % make_url(httpd, '/robots.txt')) in meth_url
assert ('GET %s' % make_url(httpd, '/site2/')) in meth_url
assert ('WARCPROX_WRITE_RECORD screenshot:%s' % make_url(httpd, '/site2/')) in meth_url
assert ('WARCPROX_WRITE_RECORD thumbnail:%s' % make_url(httpd, '/site2/')) in meth_url
assert ('GET %s' % make_url(httpd, '/site2/login.html')) in meth_url
assert ('WARCPROX_WRITE_RECORD screenshot:%s' % make_url(httpd, '/site2/login.html')) in meth_url
assert ('WARCPROX_WRITE_RECORD thumbnail:%s' % make_url(httpd, '/site2/login.html')) in meth_url
def test_seed_redirect(httpd):
test_id = 'test_seed_redirect-%s' % datetime.datetime.utcnow().isoformat()
rr = doublethink.Rethinker('localhost', db='brozzler')
seed_url = 'http://localhost:%s/site5/redirect/' % httpd.server_port
seed_url = make_url(httpd, '/site5/redirect/')
site = brozzler.Site(rr, {
'seed': 'http://localhost:%s/site5/redirect/' % httpd.server_port,
'seed': make_url(httpd, '/site5/redirect/'),
'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
assert site.scope == {'accepts': [{'ssurt': 'localhost,//%s:http:/site5/redirect/' % httpd.server_port}]}
assert site.scope == {'accepts': [{'ssurt': '%s//%s:http:/site5/redirect/' % (local_address, httpd.server_port)}]}
frontier = brozzler.RethinkDbFrontier(rr)
brozzler.new_site(frontier, site)
@ -473,19 +500,19 @@ def test_seed_redirect(httpd):
pages.sort(key=lambda page: page.hops_from_seed)
assert pages[0].hops_from_seed == 0
assert pages[0].url == seed_url
assert pages[0].redirect_url == 'http://localhost:%s/site5/destination/' % httpd.server_port
assert pages[0].redirect_url == make_url(httpd, '/site5/destination/')
assert pages[1].hops_from_seed == 1
assert pages[1].url == 'http://localhost:%s/site5/destination/page2.html' % httpd.server_port
assert pages[1].url == make_url(httpd, '/site5/destination/page2.html')
# check that scope has been updated properly
assert site.scope == {'accepts': [
{'ssurt': 'localhost,//%s:http:/site5/redirect/' % httpd.server_port},
{'ssurt': 'localhost,//%s:http:/site5/destination/' % httpd.server_port}]}
{'ssurt': '%s//%s:http:/site5/redirect/' % (local_address, httpd.server_port)},
{'ssurt': '%s//%s:http:/site5/destination/' % (local_address, httpd.server_port)}]}
def test_hashtags(httpd):
test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat()
rr = doublethink.Rethinker('localhost', db='brozzler')
seed_url = 'http://localhost:%s/site7/' % httpd.server_port
seed_url = make_url(httpd, '/site7/')
site = brozzler.Site(rr, {
'seed': seed_url,
'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
@ -507,9 +534,9 @@ def test_hashtags(httpd):
assert pages[0].url == seed_url
assert pages[0].hops_from_seed == 0
assert pages[0].brozzle_count == 1
assert pages[0].outlinks['accepted'] == ['http://localhost:%s/site7/foo.html' % httpd.server_port]
assert pages[0].outlinks['accepted'] == [make_url(httpd, '/site7/foo.html')]
assert not pages[0].hashtags
assert pages[1].url == 'http://localhost:%s/site7/foo.html' % httpd.server_port
assert pages[1].url == make_url(httpd, '/site7/foo.html')
assert pages[1].hops_from_seed == 1
assert pages[1].brozzle_count == 1
assert sorted(pages[1].hashtags) == ['#boosh','#ignored','#whee',]
@ -520,18 +547,18 @@ def test_hashtags(httpd):
captures_by_url = {
c['url']: c for c in captures if c['http_method'] != 'HEAD'}
assert seed_url in captures_by_url
assert 'http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
assert 'http://localhost:%s/site7/whee.txt' % httpd.server_port in captures_by_url
assert 'http://localhost:%s/site7/boosh.txt' % httpd.server_port in captures_by_url
assert make_url(httpd, '/site7/foo.html') in captures_by_url
assert make_url(httpd, '/site7/whee.txt') in captures_by_url
assert make_url(httpd, '/site7/boosh.txt') in captures_by_url
assert 'screenshot:%s' % seed_url in captures_by_url
assert 'thumbnail:%s' % seed_url in captures_by_url
assert 'screenshot:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
assert 'thumbnail:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
assert 'screenshot:%s' % make_url(httpd, '/site7/foo.html') in captures_by_url
assert 'thumbnail:%s' % make_url(httpd, '/site7/foo.html') in captures_by_url
def test_redirect_hashtags(httpd):
test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat()
rr = doublethink.Rethinker('localhost', db='brozzler')
seed_url = 'http://localhost:%s/site9/' % httpd.server_port
seed_url = make_url(httpd, '/site9/')
site = brozzler.Site(rr, {
'seed': seed_url,
'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
@ -553,9 +580,9 @@ def test_redirect_hashtags(httpd):
assert pages[0].url == seed_url
assert pages[0].hops_from_seed == 0
assert pages[0].brozzle_count == 1
assert pages[0].outlinks['accepted'] == ['http://localhost:%s/site9/redirect.html' % httpd.server_port]
assert pages[0].outlinks['accepted'] == [make_url(httpd, '/site9/redirect.html')]
assert not pages[0].hashtags
assert pages[1].url == 'http://localhost:%s/site9/redirect.html' % httpd.server_port
assert pages[1].url == make_url(httpd, '/site9/redirect.html')
assert pages[1].hops_from_seed == 1
assert pages[1].brozzle_count == 1
assert sorted(pages[1].hashtags) == ['#hash1','#hash2',]
@ -563,7 +590,7 @@ def test_redirect_hashtags(httpd):
time.sleep(2) # in case warcprox hasn't finished processing urls
# take a look at the captures table
captures = rr.table('captures').filter({'test_id':test_id}).run()
redirect_captures = [c for c in captures if c['url'] == 'http://localhost:%s/site9/redirect.html' % httpd.server_port and c['http_method'] == 'GET']
redirect_captures = [c for c in captures if c['url'] == make_url(httpd, '/site9/redirect.html') and c['http_method'] == 'GET']
assert len(redirect_captures) == 2 # youtube-dl + browser, no hashtags
# === expected captures ===
@ -589,9 +616,9 @@ def test_stop_crawl(httpd):
# create a new job with three sites that could be crawled forever
job_conf = {'seeds': [
{'url': 'http://localhost:%s/infinite/foo/' % httpd.server_port},
{'url': 'http://localhost:%s/infinite/bar/' % httpd.server_port},
{'url': 'http://localhost:%s/infinite/baz/' % httpd.server_port}]}
{'url': make_url(httpd, '/infinite/foo/')},
{'url': make_url(httpd, '/infinite/bar/')},
{'url': make_url(httpd, '/infinite/baz/')}]}
job = brozzler.new_job(frontier, job_conf)
assert job.id
@ -675,7 +702,7 @@ def test_warcprox_outage_resiliency(httpd):
# put together a site to crawl
test_id = 'test_warcprox_death-%s' % datetime.datetime.utcnow().isoformat()
site = brozzler.Site(rr, {
'seed': 'http://localhost:%s/infinite/' % httpd.server_port,
'seed': make_url(httpd, '/infinite/'),
'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
try:
@ -684,7 +711,7 @@ def test_warcprox_outage_resiliency(httpd):
try:
stop_service('warcprox')
except Exception as e:
logging.warn('problem stopping warcprox service: %s', e)
logging.warning('problem stopping warcprox service: %s', e)
# queue the site for brozzling
brozzler.new_site(frontier, site)
@ -771,7 +798,7 @@ def test_time_limit(httpd):
# create a new job with one seed that could be crawled forever
job_conf = {'seeds': [{
'url': 'http://localhost:%s/infinite/foo/' % httpd.server_port,
'url': make_url(httpd, '/infinite/foo/'),
'time_limit': 20}]}
job = brozzler.new_job(frontier, job_conf)
assert job.id
@ -801,7 +828,7 @@ def test_ydl_stitching(httpd):
rr = doublethink.Rethinker('localhost', db='brozzler')
frontier = brozzler.RethinkDbFrontier(rr)
site = brozzler.Site(rr, {
'seed': 'http://localhost:%s/site10/' % httpd.server_port,
'seed': make_url(httpd, '/site10/'),
'warcprox_meta': {
'warc-prefix': 'test_ydl_stitching',
'captures-table-extra-fields': {'test_id':test_id}}})
@ -819,7 +846,7 @@ def test_ydl_stitching(httpd):
assert len(pages) == 1
page = pages[0]
assert len(page.videos) == 6
stitched_url = 'youtube-dl:00001:http://localhost:%s/site10/' % httpd.server_port
stitched_url = 'youtube-dl:00001:%s' % make_url(httpd, '/site10/')
assert {
'blame': 'youtube-dl',
'content-length': 267900,

View File

@ -24,27 +24,27 @@ the brozzler virtualenv.
::
my-laptop$ vagrant ssh
vagrant@brzl:~$ source /opt/brozzler-ve34/bin/activate
(brozzler-ve34)vagrant@brzl:~$
vagrant@brzl:~$ source /opt/brozzler-ve3/bin/activate
(brozzler-ve3)vagrant@brzl:~$
Then you can run brozzler-new-site:
::
(brozzler-ve34)vagrant@brzl:~$ brozzler-new-site --proxy=localhost:8000 http://example.com/
(brozzler-ve3)vagrant@brzl:~$ brozzler-new-site --proxy=localhost:8000 http://example.com/
Or brozzler-new-job (make sure to set the proxy to localhost:8000):
::
(brozzler-ve34)vagrant@brzl:~$ cat >job1.yml <<EOF
(brozzler-ve3)vagrant@brzl:~$ cat >job1.yml <<EOF
id: job1
proxy: localhost:8000 # point at warcprox for archiving
seeds:
- url: https://example.org/
EOF
(brozzler-ve34)vagrant@brzl:~$ brozzler-new-job job1.yml
(brozzler-ve3)vagrant@brzl:~$ brozzler-new-job job1.yml
WARC files will appear in ./warcs and brozzler, warcprox and rethinkdb logs in
./logs (via vagrant synced folders).

vagrant/Vagrantfile
View File

@ -1,8 +1,9 @@
Vagrant.configure(2) do |config|
config.vm.box = "ubuntu/trusty64"
config.vm.box = "ubuntu/xenial64"
config.vm.define "10.9.9.9"
config.vm.hostname = "brzl"
config.vm.network :private_network, ip: "10.9.9.9"
config.disksize.size = '50GB'
config.vm.synced_folder "..", "/brozzler"
@ -14,6 +15,7 @@ Vagrant.configure(2) do |config|
config.vm.provision "ansible" do |ansible|
ansible.inventory_path = "../ansible/hosts-vagrant"
ansible.playbook = "../ansible/playbook.yml"
# ansible.verbose = "-vvv"
end
config.vm.provider 'virtualbox' do |v|
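
The new config.disksize.size setting appears to rely on the vagrant-disksize plugin (an assumption based on the option name), which would need to be installed on the host before vagrant up::

    vagrant plugin install vagrant-disksize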

View File

@ -10,12 +10,12 @@ cd $(dirname "${BASH_SOURCE[0]}")
vagrant up
echo service status:
vagrant ssh -- 'status warcprox ;
status Xvnc ;
status brozzler-worker ;
status brozzler-dashboard ;
status vnc-websock'
vagrant ssh -- 'sudo svstat /etc/service/warcprox ;
sudo svstat /etc/service/Xvnc ;
sudo svstat /etc/service/brozzler-worker ;
sudo svstat /etc/service/brozzler-dashboard ;
sudo svstat /etc/service/vnc-websock'
echo
vagrant ssh -- 'set -x ; source /opt/brozzler-ve34/bin/activate && pip install pytest && pip install --upgrade --pre "warcprox>=2.1b1.dev86"'
vagrant ssh -- "source /opt/brozzler-ve34/bin/activate && DISPLAY=:1 py.test -v /brozzler/tests $@"
vagrant ssh -- 'set -x ; source /opt/brozzler-ve3/bin/activate && pip install pytest==4.3.0 && pip install --upgrade --pre "warcprox>=2.1b1.dev86"'
vagrant ssh -- "source /opt/brozzler-ve3/bin/activate && DISPLAY=:1 py.test --tb=native -v /brozzler/tests $@"

View File

@ -7,7 +7,7 @@ This is a standalone script with no dependencies other than python, and should
work with python 2.7 or python 3.2+. The only reason it's not a bash script is
so we can use the argparse library.
Copyright (C) 2016 Internet Archive
Copyright (C) 2016-2019 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@ -41,9 +41,8 @@ def main(argv=[]):
subprocess.call([
'vagrant', 'ssh', '--',
'f=`mktemp` && cat > $f && '
'PYTHONPATH=/home/vagrant/brozzler-ve34/lib/python3.4/site-packages '
'/home/vagrant/brozzler-ve34/bin/python '
'/home/vagrant/brozzler-ve34/bin/brozzler-new-job $f'],
'/home/vagrant/brozzler-ve3/bin/python '
'/home/vagrant/brozzler-ve3/bin/brozzler-new-job $f'],
stdin=f)
if __name__ == '__main__':

View File

@ -74,11 +74,8 @@ def main(argv=[]):
os.chdir(os.path.dirname(__file__))
cmd = (
'PYTHONPATH=/home/vagrant/brozzler-ve34/lib/python3.4/site-packages '
'/home/vagrant/brozzler-ve34/bin/python '
'/home/vagrant/brozzler-ve34/bin/brozzler-new-site '
'--proxy=localhost:8000 %s %s') % (
' '.join(options), args.seed)
'/opt/brozzler-ve3/bin/python /opt/brozzler-ve3/bin/brozzler-new-site '
'%s %s') % (' '.join(options), args.seed)
subprocess.call(['vagrant', 'ssh', '--', cmd])
if __name__ == '__main__':