mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 08:39:59 -05:00
Merge pull request #149 from nlevitt/travis-py37
trying to make this work with xenial for travis
This commit is contained in:
commit
eb34bebb91
13
.travis.yml
13
.travis.yml
@ -11,19 +11,22 @@ before_install:
|
|||||||
- sudo pip install ansible==2.1.3.0
|
- sudo pip install ansible==2.1.3.0
|
||||||
install:
|
install:
|
||||||
- ansible-playbook --extra-vars="brozzler_pip_name=file://$TRAVIS_BUILD_DIR#egg=brozzler user=travis" --inventory-file=ansible/hosts-localhost ansible/playbook.yml
|
- ansible-playbook --extra-vars="brozzler_pip_name=file://$TRAVIS_BUILD_DIR#egg=brozzler user=travis" --inventory-file=ansible/hosts-localhost ansible/playbook.yml
|
||||||
- pip install $TRAVIS_BUILD_DIR git+https://github.com/internetarchive/warcprox.git#egg=warcprox pytest
|
- pip install $TRAVIS_BUILD_DIR git+https://github.com/internetarchive/warcprox.git#egg=warcprox pytest==4.3.0
|
||||||
- chromium-browser --version
|
- chromium-browser --version
|
||||||
- sudo apt-get update
|
- sudo apt-get update
|
||||||
- sudo apt-get install --only-upgrade chromium-browser
|
- sudo apt-get install --only-upgrade chromium-browser
|
||||||
- chromium-browser --version
|
- chromium-browser --version
|
||||||
- sudo service brozzler-worker restart
|
- ps ww -fHe
|
||||||
|
- sudo cat /var/log/Xvnc.log
|
||||||
|
- sudo cat /var/log/brozzler-worker.log
|
||||||
|
- sudo cat /var/log/warcprox.log
|
||||||
script:
|
script:
|
||||||
- DISPLAY=:1 py.test --tb=native -v tests
|
- DISPLAY=:1 py.test --tb=native -v tests
|
||||||
after_failure:
|
after_failure:
|
||||||
- chromium-browser --version
|
- chromium-browser --version
|
||||||
- sudo cat /var/log/upstart/warcprox.log
|
- sudo cat /var/log/warcprox.log
|
||||||
- sudo cat /var/log/upstart/brozzler-worker.log
|
- sudo cat /var/log/brozzler-worker.log
|
||||||
- sudo cat /var/log/upstart/pywb.log
|
- sudo cat /var/log/pywb.log
|
||||||
notifications:
|
notifications:
|
||||||
slack:
|
slack:
|
||||||
secure: KPPXSscXnmSEQ2NXBZFKrzDEYHg067Kv1WR7RTRUH8EIlSS9MHTyErRa7HkaRPmqOllj4vvPbplNU2ALnCfhP4cqW+MvF0xv3GuEGXQ7Om2sBvVUQ3w0JJ5rLq9ferAfGdSnQFeViqfDix5LA3fMNZGouUHQdUHq7iO8E9n9jntvkKO9Jff7Dyo0K5KvOZOJfM9KsqFZLlFO5zoNB6Y9jubIT7+Ulk3EDto/Kny34VPIyJIm7y0cHHlYLEq780AweY0EIwMyMg/VPSRrVAsbLSrilO0YRgsQpjPC9Ci/rAWNWooaOk0eA+bwv1uHQnGtH0z446XUMXr3UZ2QlD4DE/uoP2okkl8EtqvlmEyjV8eO86TqYFDRgKfYpvlK6hHtb7SAHX28QeXQjbKNc5f7KpKO5PtZqaoBRL7acLlKyS8xQGiRtonTPFSBTFR2A+s6dZmKO9dDboglptiHk4dvL1ZD4S8qLJn1JjTJqvIU6tpCY3BpNErn4n1MkDjN5nqdXf7Q9Vmui8vRetwnMf1oXcsKj9FEt2utNfDqFNXcFsN+Mnr9rhXQ1++gt/7Zo844OowiARcxqZTNy5LqSD01WgGCvNMy3Odf+FTQ8PcDOF+001+g8La1R99U0o9/hT/gy+WYk2prYneWru4pQHF/a6goZgkLTwkskcaPVpDJtDs=
|
secure: KPPXSscXnmSEQ2NXBZFKrzDEYHg067Kv1WR7RTRUH8EIlSS9MHTyErRa7HkaRPmqOllj4vvPbplNU2ALnCfhP4cqW+MvF0xv3GuEGXQ7Om2sBvVUQ3w0JJ5rLq9ferAfGdSnQFeViqfDix5LA3fMNZGouUHQdUHq7iO8E9n9jntvkKO9Jff7Dyo0K5KvOZOJfM9KsqFZLlFO5zoNB6Y9jubIT7+Ulk3EDto/Kny34VPIyJIm7y0cHHlYLEq780AweY0EIwMyMg/VPSRrVAsbLSrilO0YRgsQpjPC9Ci/rAWNWooaOk0eA+bwv1uHQnGtH0z446XUMXr3UZ2QlD4DE/uoP2okkl8EtqvlmEyjV8eO86TqYFDRgKfYpvlK6hHtb7SAHX28QeXQjbKNc5f7KpKO5PtZqaoBRL7acLlKyS8xQGiRtonTPFSBTFR2A+s6dZmKO9dDboglptiHk4dvL1ZD4S8qLJn1JjTJqvIU6tpCY3BpNErn4n1MkDjN5nqdXf7Q9Vmui8vRetwnMf1oXcsKj9FEt2utNfDqFNXcFsN+Mnr9rhXQ1++gt/7Zo844OowiARcxqZTNy5LqSD01WgGCvNMy3Odf+FTQ8PcDOF+001+g8La1R99U0o9/hT/gy+WYk2prYneWru4pQHF/a6goZgkLTwkskcaPVpDJtDs=
|
||||||
|
@ -1,7 +1,9 @@
|
|||||||
[all:vars]
|
[all:vars]
|
||||||
warcs_dir=/vagrant/warcs
|
warcs_dir=/vagrant/warcs
|
||||||
brozzler_pip_name='-e /brozzler'
|
# brozzler_pip_name='-e /brozzler' # not working anymore? :(
|
||||||
|
brozzler_pip_name='/brozzler'
|
||||||
user=vagrant
|
user=vagrant
|
||||||
|
ansible_python_interpreter=/usr/bin/python3
|
||||||
### possible values for a prod deployment
|
### possible values for a prod deployment
|
||||||
# brozzler_pip_name=brozzler # get it from pypi
|
# brozzler_pip_name=brozzler # get it from pypi
|
||||||
# brozzler_pip_name=git+https://github.com/internetarchive/brozzler.git#egg=brozzler
|
# brozzler_pip_name=git+https://github.com/internetarchive/brozzler.git#egg=brozzler
|
||||||
|
@ -1,4 +1,8 @@
|
|||||||
---
|
---
|
||||||
- name: restart brozzler-dashboard
|
- name: restart brozzler-dashboard
|
||||||
service: name=brozzler-dashboard state=restarted
|
svc:
|
||||||
|
name: brozzler-dashboard
|
||||||
|
state: restarted
|
||||||
|
service_dir: /etc/service
|
||||||
become: true
|
become: true
|
||||||
|
|
||||||
|
@ -1,20 +1,33 @@
|
|||||||
---
|
---
|
||||||
- name: mkdir {{venv_root}}/brozzler-dashboard-ve34
|
- name: mkdir {{venv_root}}/brozzler-dashboard-ve3
|
||||||
file: path={{venv_root}}/brozzler-dashboard-ve34 state=directory
|
file: path={{venv_root}}/brozzler-dashboard-ve3 state=directory
|
||||||
owner={{user}}
|
owner={{user}}
|
||||||
become: true
|
become: true
|
||||||
|
|
||||||
- name: install brozzler[dashboard] in virtualenv
|
- name: install brozzler[dashboard] in virtualenv
|
||||||
pip: name='{{brozzler_pip_name}}[dashboard]'
|
pip:
|
||||||
virtualenv={{venv_root}}/brozzler-dashboard-ve34
|
name: '{{brozzler_pip_name}}[dashboard]'
|
||||||
virtualenv_python=python3.4
|
virtualenv: '{{venv_root}}/brozzler-dashboard-ve3'
|
||||||
extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
|
virtualenv_python: python3
|
||||||
|
virtualenv_command: python3 /usr/lib/python3/dist-packages/virtualenv.py
|
||||||
|
extra_args: '--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
|
||||||
become: true
|
become: true
|
||||||
become_user: '{{user}}'
|
become_user: '{{user}}'
|
||||||
notify:
|
notify:
|
||||||
- restart brozzler-dashboard
|
- restart brozzler-dashboard
|
||||||
- name: install upstart config /etc/init/brozzler-dashboard.conf
|
|
||||||
|
- name: mkdir /etc/service/brozzler-dashboard
|
||||||
|
file:
|
||||||
|
path: /etc/service/brozzler-dashboard
|
||||||
|
state: directory
|
||||||
become: true
|
become: true
|
||||||
template: src=templates/brozzler-dashboard.conf.j2
|
|
||||||
dest=/etc/init/brozzler-dashboard.conf
|
- name: install /etc/service/brozzler-dashboard/run
|
||||||
|
template:
|
||||||
|
src: templates/brozzler-dashboard-run.j2
|
||||||
|
dest: /etc/service/brozzler-dashboard/run
|
||||||
|
mode: 0755
|
||||||
notify:
|
notify:
|
||||||
- restart brozzler-dashboard
|
- restart brozzler-dashboard
|
||||||
|
become: true
|
||||||
|
|
||||||
|
@ -0,0 +1,15 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
logfile=/var/log/brozzler-dashboard.log
|
||||||
|
touch $logfile
|
||||||
|
chown {{user}} $logfile
|
||||||
|
|
||||||
|
source /opt/brozzler-dashboard-ve3/bin/activate
|
||||||
|
|
||||||
|
exec nice setuidgid {{user}} \
|
||||||
|
env WAYBACK_BASEURL=http://{{groups['pywb'][0]}}:8880/brozzler \
|
||||||
|
RETHINKDB_SERVERS={{groups['rethinkdb'] | join(',')}} \
|
||||||
|
RETHINKDB_DB=brozzler LANG=en_US.UTF-8 LC_COLLATE=C \
|
||||||
|
gunicorn --bind=0.0.0.0:8881 brozzler.dashboard:app \
|
||||||
|
>> $logfile 2>&1
|
||||||
|
|
@ -1,18 +0,0 @@
|
|||||||
description "brozzler-dashboard"
|
|
||||||
|
|
||||||
start on runlevel [2345]
|
|
||||||
stop on runlevel [!2345]
|
|
||||||
|
|
||||||
env PYTHONPATH={{venv_root}}/brozzler-dashboard-ve34/lib/python3.4/site-packages
|
|
||||||
env PATH={{venv_root}}/brozzler-dashboard-ve34/bin:/usr/bin:/bin
|
|
||||||
env LC_ALL=C.UTF-8
|
|
||||||
|
|
||||||
env WAYBACK_BASEURL=http://{{groups['pywb'][0]}}:8880/brozzler
|
|
||||||
env RETHINKDB_SERVERS={{groups['rethinkdb'] | join(',')}}
|
|
||||||
env RETHINKDB_DB=brozzler
|
|
||||||
|
|
||||||
setuid {{user}}
|
|
||||||
|
|
||||||
console log
|
|
||||||
|
|
||||||
exec gunicorn --bind=0.0.0.0:8881 brozzler.dashboard:app
|
|
@ -1,13 +1,22 @@
|
|||||||
---
|
---
|
||||||
- name: restart Xvnc
|
- name: restart Xvnc
|
||||||
service: name=Xvnc state=restarted
|
svc:
|
||||||
become: true
|
name: Xvnc
|
||||||
- name: restart websockify
|
state: restarted
|
||||||
service: name=websockify state=restarted
|
service_dir: /etc/service
|
||||||
become: true
|
become: true
|
||||||
|
|
||||||
- name: restart vnc-websock
|
- name: restart vnc-websock
|
||||||
service: name=vnc-websock state=restarted
|
svc:
|
||||||
|
name: vnc-websock
|
||||||
|
state: restarted
|
||||||
|
service_dir: /etc/service
|
||||||
become: true
|
become: true
|
||||||
|
|
||||||
- name: restart brozzler-worker
|
- name: restart brozzler-worker
|
||||||
service: name=brozzler-worker state=restarted
|
svc:
|
||||||
|
name: brozzler-worker
|
||||||
|
state: restarted
|
||||||
|
service_dir: /etc/service
|
||||||
become: true
|
become: true
|
||||||
|
|
||||||
|
@ -3,14 +3,22 @@
|
|||||||
apt_repository: repo='deb http://archive.canonical.com/ubuntu trusty partner'
|
apt_repository: repo='deb http://archive.canonical.com/ubuntu trusty partner'
|
||||||
state=present
|
state=present
|
||||||
become: true
|
become: true
|
||||||
|
|
||||||
- apt: update_cache=yes
|
- apt: update_cache=yes
|
||||||
become: true
|
become: true
|
||||||
|
|
||||||
- name: ensure required packages are installed
|
- name: ensure required packages are installed
|
||||||
become: true
|
become: true
|
||||||
apt: name={{item}} state=present
|
apt: name={{item}} state=present
|
||||||
with_items:
|
with_items:
|
||||||
- vnc4server
|
|
||||||
- chromium-browser
|
- chromium-browser
|
||||||
|
- vnc4server
|
||||||
|
- libjpeg-turbo8-dev
|
||||||
|
- zlib1g-dev
|
||||||
|
- gcc
|
||||||
|
- python3-dev
|
||||||
|
- python3-dbg
|
||||||
|
- adobe-flashplugin
|
||||||
- xfonts-base
|
- xfonts-base
|
||||||
- fonts-arphic-bkai00mp
|
- fonts-arphic-bkai00mp
|
||||||
- fonts-arphic-bsmi00lp
|
- fonts-arphic-bsmi00lp
|
||||||
@ -24,51 +32,74 @@
|
|||||||
- fonts-sil-padauk
|
- fonts-sil-padauk
|
||||||
- fonts-unfonts-extra
|
- fonts-unfonts-extra
|
||||||
- fonts-unfonts-core
|
- fonts-unfonts-core
|
||||||
- ttf-indic-fonts
|
- fonts-indic
|
||||||
- fonts-thai-tlwg
|
- fonts-thai-tlwg
|
||||||
- fonts-lklug-sinhala
|
- fonts-lklug-sinhala
|
||||||
- git
|
|
||||||
- libjpeg-turbo8-dev
|
- name: mkdir /etc/service/{Xvnc,vnc-websock,brozzler-worker}
|
||||||
- zlib1g-dev
|
file:
|
||||||
- gcc
|
path: '/etc/service/{{item}}'
|
||||||
- g++
|
state: directory
|
||||||
- libpython3.4-dev
|
with_items:
|
||||||
- adobe-flashplugin
|
- Xvnc
|
||||||
- name: install Xvnc upstart config /etc/init/Xvnc.conf
|
- vnc-websock
|
||||||
template: src=templates/Xvnc.conf.j2 dest=/etc/init/Xvnc.conf
|
- brozzler-worker
|
||||||
become: true
|
become: true
|
||||||
|
|
||||||
|
- name: install /etc/service/Xvnc/run
|
||||||
|
template:
|
||||||
|
src: templates/Xvnc-run.j2
|
||||||
|
dest: /etc/service/Xvnc/run
|
||||||
|
mode: 0755
|
||||||
notify:
|
notify:
|
||||||
- restart Xvnc
|
- restart Xvnc
|
||||||
- name: mkdir {{venv_root}}/websockify-ve34
|
|
||||||
become: true
|
become: true
|
||||||
file: path={{venv_root}}/websockify-ve34 state=directory owner={{user}}
|
|
||||||
|
- name: mkdir {{venv_root}}/websockify-ve3
|
||||||
|
become: true
|
||||||
|
file: path={{venv_root}}/websockify-ve3 state=directory owner={{user}}
|
||||||
|
|
||||||
- name: install websockify in virtualenv
|
- name: install websockify in virtualenv
|
||||||
pip: name=git+https://github.com/kanaka/websockify.git#egg=websockify
|
pip:
|
||||||
virtualenv={{venv_root}}/websockify-ve34
|
name: git+https://github.com/kanaka/websockify.git#egg=websockify
|
||||||
virtualenv_python=python3.4
|
virtualenv: '{{venv_root}}/websockify-ve3'
|
||||||
extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
|
virtualenv_python: python3
|
||||||
|
virtualenv_command: python3 /usr/lib/python3/dist-packages/virtualenv.py
|
||||||
|
extra_args: '--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
|
||||||
become: true
|
become: true
|
||||||
become_user: '{{user}}'
|
become_user: '{{user}}'
|
||||||
- name: install vnc-websock upstart config /etc/init/vnc-websock.conf
|
|
||||||
template: src=templates/vnc-websock.conf.j2 dest=/etc/init/vnc-websock.conf
|
- name: install /etc/service/vnc-websock/run
|
||||||
become: true
|
template:
|
||||||
|
src: templates/vnc-websock-run.j2
|
||||||
|
dest: /etc/service/vnc-websock/run
|
||||||
|
mode: 0755
|
||||||
notify:
|
notify:
|
||||||
- restart vnc-websock
|
- restart vnc-websock
|
||||||
- name: mkdir {{venv_root}}/brozzler-ve34
|
|
||||||
become: true
|
become: true
|
||||||
file: path={{venv_root}}/brozzler-ve34 state=directory owner={{user}}
|
|
||||||
|
- name: mkdir {{venv_root}}/brozzler-ve3
|
||||||
|
become: true
|
||||||
|
file: path={{venv_root}}/brozzler-ve3 state=directory owner={{user}}
|
||||||
|
|
||||||
- name: install brozzler in virtualenv
|
- name: install brozzler in virtualenv
|
||||||
pip: # name=git+https://github.com/internetarchive/brozzler.git#egg=brozzler
|
pip:
|
||||||
name='{{brozzler_pip_name}}'
|
name: '{{brozzler_pip_name}}'
|
||||||
virtualenv={{venv_root}}/brozzler-ve34
|
virtualenv: '{{venv_root}}/brozzler-ve3'
|
||||||
virtualenv_python=python3.4
|
virtualenv_python: python3
|
||||||
extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
|
virtualenv_command: python3 /usr/lib/python3/dist-packages/virtualenv.py
|
||||||
|
extra_args: '--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
|
||||||
become: true
|
become: true
|
||||||
become_user: '{{user}}'
|
become_user: '{{user}}'
|
||||||
notify:
|
notify:
|
||||||
- restart brozzler-worker
|
- restart brozzler-worker
|
||||||
- name: install brozzler-worker upstart config /etc/init/brozzler-worker.conf
|
|
||||||
template: src=templates/brozzler-worker.conf.j2 dest=/etc/init/brozzler-worker.conf
|
- name: install /etc/service/brozzler-worker/run
|
||||||
become: true
|
template:
|
||||||
|
src: templates/brozzler-worker-run.j2
|
||||||
|
dest: /etc/service/brozzler-worker/run
|
||||||
|
mode: 0755
|
||||||
notify:
|
notify:
|
||||||
- restart brozzler-worker
|
- restart brozzler-worker
|
||||||
|
become: true
|
||||||
|
|
||||||
|
14
ansible/roles/brozzler-worker/templates/Xvnc-run.j2
Normal file
14
ansible/roles/brozzler-worker/templates/Xvnc-run.j2
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
cd /tmp
|
||||||
|
|
||||||
|
logfile=/var/log/Xvnc.log
|
||||||
|
touch $logfile
|
||||||
|
chown {{user}} $logfile
|
||||||
|
|
||||||
|
exec nice setuidgid {{user}} Xvnc4 :1 -auth /tmp/Xauthority.{{user}} \
|
||||||
|
-geometry 1600x1000 -depth 24 -rfbwait 0 -nolisten tcp -rfbport 5901 \
|
||||||
|
-SecurityTypes None -pn -fp /usr/share/fonts/X11/misc/ -co /etc/X11/rgb \
|
||||||
|
AcceptCutText=0 AcceptPointerEvents=0 AcceptKeyEvents=0 \
|
||||||
|
>> $logfile 2>&1
|
||||||
|
|
@ -1,14 +0,0 @@
|
|||||||
description "Xvnc"
|
|
||||||
|
|
||||||
start on runlevel [2345]
|
|
||||||
stop on runlevel [!2345]
|
|
||||||
|
|
||||||
setuid {{user}}
|
|
||||||
|
|
||||||
console log
|
|
||||||
|
|
||||||
exec nice Xvnc4 :1 -auth /tmp/Xauthority.{{user}} \
|
|
||||||
-geometry 1600x1000 -depth 24 -rfbwait 0 -nolisten tcp -rfbport 5901 \
|
|
||||||
-SecurityTypes None -pn -fp /usr/share/fonts/X11/misc/ -co /etc/X11/rgb \
|
|
||||||
AcceptCutText=0 AcceptPointerEvents=0 AcceptKeyEvents=0
|
|
||||||
|
|
@ -0,0 +1,17 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
logfile=/var/log/brozzler-worker.log
|
||||||
|
touch $logfile
|
||||||
|
chown {{user}} $logfile
|
||||||
|
|
||||||
|
source {{venv_root}}/brozzler-ve3/bin/activate
|
||||||
|
|
||||||
|
exec nice setuidgid {{user}} \
|
||||||
|
env DISPLAY=:1 LANG=en_US.UTF-8 LC_COLLATE=C \
|
||||||
|
brozzler-worker \
|
||||||
|
--rethinkdb-servers={{groups['rethinkdb'] | join(',')}} \
|
||||||
|
--max-browsers=4 \
|
||||||
|
--trace \
|
||||||
|
--warcprox-auto \
|
||||||
|
>> $logfile 2>&1
|
||||||
|
|
@ -1,25 +0,0 @@
|
|||||||
description "brozzler-worker"
|
|
||||||
|
|
||||||
start on runlevel [2345]
|
|
||||||
stop on runlevel [!2345]
|
|
||||||
|
|
||||||
env DISPLAY=:1
|
|
||||||
env PATH={{venv_root}}/brozzler-ve34/bin:/usr/bin:/bin
|
|
||||||
env PYTHONPATH={{venv_root}}/brozzler-ve34/lib/python3.4/site-packages
|
|
||||||
env LANG=C.UTF-8
|
|
||||||
|
|
||||||
setuid {{user}}
|
|
||||||
|
|
||||||
console log
|
|
||||||
|
|
||||||
# depends on vnc server
|
|
||||||
start on started Xvnc
|
|
||||||
stop on stopping Xvnc
|
|
||||||
|
|
||||||
kill timeout 60
|
|
||||||
|
|
||||||
exec nice brozzler-worker \
|
|
||||||
--rethinkdb-servers={{groups['rethinkdb'] | join(',')}} \
|
|
||||||
--max-browsers=4 \
|
|
||||||
--verbose \
|
|
||||||
--warcprox-auto
|
|
10
ansible/roles/brozzler-worker/templates/vnc-websock-run.j2
Normal file
10
ansible/roles/brozzler-worker/templates/vnc-websock-run.j2
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
logfile=/var/log/vnc-websock.log
|
||||||
|
touch $logfile
|
||||||
|
chown {{user}} $logfile
|
||||||
|
|
||||||
|
source /opt/websockify-ve3/bin/activate
|
||||||
|
|
||||||
|
exec nice setuidgid {{user}} websockify 0.0.0.0:8901 localhost:5901 >> $logfile 2>&1
|
||||||
|
|
@ -1,15 +0,0 @@
|
|||||||
description "vnc-websock"
|
|
||||||
|
|
||||||
start on runlevel [2345]
|
|
||||||
stop on runlevel [!2345]
|
|
||||||
|
|
||||||
setuid {{user}}
|
|
||||||
|
|
||||||
console log
|
|
||||||
|
|
||||||
env PYTHONPATH={{venv_root}}/websockify-ve34/lib/python3.4/site-packages
|
|
||||||
env PATH={{venv_root}}/websockify-ve34/bin:/usr/bin:/bin
|
|
||||||
|
|
||||||
# port 8901 is hard-coded in brozzler/dashboard/static/partials/workers.html
|
|
||||||
exec nice websockify 0.0.0.0:8901 localhost:5901
|
|
||||||
|
|
@ -1,44 +1,74 @@
|
|||||||
---
|
---
|
||||||
# get latest pip (had problems with version from apt-get, specifically
|
- apt:
|
||||||
# "pip install pyopenssl" did not install the dependency "cryptography")
|
name:
|
||||||
# http://stackoverflow.com/questions/34587473/what-is-get-pip-py-checksum-where-can-i-get-it-for-sure
|
|
||||||
- name: install setuptools for python 2 and 3
|
|
||||||
become: true
|
|
||||||
apt: name={{item}} state=present
|
|
||||||
with_items:
|
|
||||||
- python-setuptools
|
|
||||||
- python3-setuptools
|
- python3-setuptools
|
||||||
- name: download pip-9.0.1.tar.gz
|
- python3-pip
|
||||||
get_url:
|
- python3-virtualenv
|
||||||
url: https://pypi.python.org/packages/11/b6/abcb525026a4be042b486df43905d6893fb04f05aac21c32c638e939e447/pip-9.0.1.tar.gz
|
- daemontools
|
||||||
dest: /tmp
|
- daemontools-run
|
||||||
checksum: sha1:57ff41e99cb01b6a1c2b0999161589b726f0ec8b
|
state: present
|
||||||
- name: extract pip-9.0.1.tar.gz
|
update_cache: yes
|
||||||
unarchive: src=/tmp/pip-9.0.1.tar.gz dest=/tmp copy=no
|
cache_valid_time: 86400 # one day
|
||||||
|
become: true
|
||||||
|
|
||||||
|
# # get recent virtualenv, which bundles a recent pip
|
||||||
|
# - find:
|
||||||
|
# paths:
|
||||||
|
# - /usr/local/lib/python3.4/dist-packages
|
||||||
|
# - /usr/local/lib/python3.5/dist-packages
|
||||||
|
# recurse: true
|
||||||
|
# patterns: virtualenv.py
|
||||||
|
# contains: '__version__ = "16.4.3"'
|
||||||
|
# register: virtualenv_py_16_4_3
|
||||||
|
#
|
||||||
|
# - command: mktemp -d
|
||||||
|
# register: mktempd_out
|
||||||
|
# when: virtualenv_py_16_4_3.matched == 0
|
||||||
|
#
|
||||||
|
# - name: download virtualenv-16.4.3
|
||||||
|
# get_url:
|
||||||
|
# url: https://files.pythonhosted.org/packages/37/db/89d6b043b22052109da35416abc3c397655e4bd3cff031446ba02b9654fa/virtualenv-16.4.3.tar.gz
|
||||||
|
# dest: '{{mktempd_out.stdout}}'
|
||||||
|
# checksum: sha256:984d7e607b0a5d1329425dd8845bd971b957424b5ba664729fab51ab8c11bc39
|
||||||
|
# when: virtualenv_py_16_4_3.matched == 0
|
||||||
|
#
|
||||||
|
# - name: extract virtualenv-16.4.3.tar.gz
|
||||||
|
# unarchive:
|
||||||
|
# src: '{{mktempd_out.stdout}}/virtualenv-16.4.3.tar.gz'
|
||||||
|
# dest: '{{mktempd_out.stdout}}'
|
||||||
|
# copy: no
|
||||||
|
# when: virtualenv_py_16_4_3.matched == 0
|
||||||
|
#
|
||||||
|
# - name: run "python3 setup.py install" in {{mktempd_out.stdout}}/virtualenv-16.4.3
|
||||||
|
# become: true
|
||||||
|
# command: python3 setup.py install
|
||||||
|
# args:
|
||||||
|
# chdir: '{{mktempd_out.stdout}}/virtualenv-16.4.3'
|
||||||
|
# when: virtualenv_py_16_4_3.matched == 0
|
||||||
|
#
|
||||||
|
# - file:
|
||||||
|
# path: '{{mktempd_out.stdout}}'
|
||||||
|
# state: absent
|
||||||
|
# become: true
|
||||||
|
# when: virtualenv_py_16_4_3.matched == 0
|
||||||
|
|
||||||
# this clause is a workaround for travis-ci, which only wants to install in /usr
|
# this clause is a workaround for travis-ci, which only wants to install in /usr
|
||||||
# see https://travis-ci.org/internetarchive/brozzler/builds/174338601
|
# see https://travis-ci.org/internetarchive/brozzler/builds/174338601
|
||||||
# but it complains that /usr/lib/python3.4/site-packages doesn't exist
|
# but it complains that /usr/lib/python3.5/site-packages doesn't exist
|
||||||
# see https://travis-ci.org/internetarchive/brozzler/builds/174094831
|
# see https://travis-ci.org/internetarchive/brozzler/builds/174094831
|
||||||
- file: path={{item}} state=directory
|
- file:
|
||||||
|
path: '{{item}}'
|
||||||
|
state: directory
|
||||||
with_items:
|
with_items:
|
||||||
- /usr/lib/python3.4/site-packages
|
- /usr/lib/python3.5/site-packages
|
||||||
- /usr/lib/python3.4/dist-packages
|
- /usr/lib/python3.5/dist-packages
|
||||||
become: true
|
become: true
|
||||||
|
|
||||||
- name: run "python3 setup.py install" in /tmp/pip-9.0.1
|
|
||||||
command: python3 setup.py install
|
|
||||||
chdir=/tmp/pip-9.0.1
|
|
||||||
creates=/usr/local/lib/python3.4/dist-packages/pip-9.0.1-py3.4.egg/pip/__init__.py
|
|
||||||
become: true
|
|
||||||
- name: run "pip install virtualenv"
|
|
||||||
command: pip install virtualenv
|
|
||||||
creates=/usr/local/lib/python3.4/dist-packages/virtualenv.py
|
|
||||||
become: true
|
|
||||||
- command: id {{user}}
|
- command: id {{user}}
|
||||||
register: id_user
|
register: id_user
|
||||||
ignore_errors: true
|
ignore_errors: true
|
||||||
changed_when: false
|
changed_when: false
|
||||||
|
|
||||||
- name: ensure service user {{user}} exists
|
- name: ensure service user {{user}} exists
|
||||||
user: name={{user}} system=yes createhome=no home=/nonexistent
|
user: name={{user}} system=yes createhome=no home=/nonexistent
|
||||||
shell=/usr/sbin/nologin
|
shell=/usr/sbin/nologin
|
||||||
|
@ -1,5 +1,9 @@
|
|||||||
---
|
---
|
||||||
- name: restart pywb
|
- name: restart pywb
|
||||||
service: name=pywb state=restarted
|
svc:
|
||||||
|
name: pywb
|
||||||
|
state: restarted
|
||||||
|
service_dir: /etc/service
|
||||||
become: true
|
become: true
|
||||||
|
|
||||||
|
|
||||||
|
@ -1,36 +1,52 @@
|
|||||||
---
|
---
|
||||||
- name: mkdir {{venv_root}}/pywb-ve34
|
- name: mkdir {{venv_root}}/pywb-ve3
|
||||||
file: path={{venv_root}}/pywb-ve34 state=directory
|
file: path={{venv_root}}/pywb-ve3 state=directory
|
||||||
owner={{user}}
|
owner={{user}}
|
||||||
become: true
|
become: true
|
||||||
|
|
||||||
- name: install pywb in virtualenv
|
- name: install pywb in virtualenv
|
||||||
pip: name=pywb
|
pip:
|
||||||
version=0.33.2
|
name: pywb
|
||||||
virtualenv={{venv_root}}/pywb-ve34
|
version: 0.33.2
|
||||||
virtualenv_python=python3.4
|
virtualenv: '{{venv_root}}/pywb-ve3'
|
||||||
extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
|
virtualenv_python: python3
|
||||||
|
virtualenv_command: python3 /usr/lib/python3/dist-packages/virtualenv.py
|
||||||
|
extra_args: '--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
|
||||||
become: true
|
become: true
|
||||||
become_user: '{{user}}'
|
become_user: '{{user}}'
|
||||||
notify:
|
notify:
|
||||||
- restart pywb
|
- restart pywb
|
||||||
|
|
||||||
- name: install brozzler in pywb virtualenv
|
- name: install brozzler in pywb virtualenv
|
||||||
pip: name='{{brozzler_pip_name}}'
|
pip:
|
||||||
virtualenv={{venv_root}}/pywb-ve34
|
name: '{{brozzler_pip_name}}'
|
||||||
virtualenv_python=python3.4
|
virtualenv: '{{venv_root}}/pywb-ve3'
|
||||||
extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
|
virtualenv_python: python3
|
||||||
|
virtualenv_command: python3 /usr/lib/python3/dist-packages/virtualenv.py
|
||||||
|
extra_args: '--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
|
||||||
become: true
|
become: true
|
||||||
become_user: '{{user}}'
|
become_user: '{{user}}'
|
||||||
notify:
|
notify:
|
||||||
- restart pywb
|
- restart pywb
|
||||||
|
|
||||||
- name: pywb config file /etc/pywb.yml
|
- name: pywb config file /etc/pywb.yml
|
||||||
template: src=templates/pywb.yml.j2
|
template: src=templates/pywb.yml.j2
|
||||||
dest=/etc/pywb.yml
|
dest=/etc/pywb.yml
|
||||||
become: true
|
become: true
|
||||||
notify:
|
notify:
|
||||||
- restart pywb
|
- restart pywb
|
||||||
- name: upstart config file /etc/init/pywb.conf
|
|
||||||
template: src=templates/pywb.conf.j2
|
- name: mkdir /etc/service/pywb
|
||||||
dest=/etc/init/pywb.conf
|
file:
|
||||||
|
path: /etc/service/pywb
|
||||||
|
state: directory
|
||||||
become: true
|
become: true
|
||||||
|
|
||||||
|
- name: install /etc/service/pywb/run
|
||||||
|
template:
|
||||||
|
src: templates/pywb-run.j2
|
||||||
|
dest: /etc/service/pywb/run
|
||||||
|
mode: 0755
|
||||||
notify:
|
notify:
|
||||||
- restart pywb
|
- restart pywb
|
||||||
|
become: true
|
||||||
|
10
ansible/roles/pywb/templates/pywb-run.j2
Normal file
10
ansible/roles/pywb/templates/pywb-run.j2
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
logfile=/var/log/pywb.log
|
||||||
|
touch $logfile
|
||||||
|
chown {{user}} $logfile
|
||||||
|
|
||||||
|
exec nice setuidgid {{user}} env PYWB_CONFIG_FILE=/etc/pywb.yml \
|
||||||
|
{{venv_root}}/pywb-ve3/bin/python {{venv_root}}/pywb-ve3/bin/brozzler-wayback \
|
||||||
|
>> $logfile 2>&1
|
||||||
|
|
@ -1,14 +0,0 @@
|
|||||||
description "pywb"
|
|
||||||
|
|
||||||
start on runlevel [2345]
|
|
||||||
stop on runlevel [!2345]
|
|
||||||
|
|
||||||
env PYTHONPATH={{venv_root}}/pywb-ve34/lib/python3.4/site-packages
|
|
||||||
env PATH={{venv_root}}/pywb-ve34/bin:/usr/bin:/bin
|
|
||||||
env PYWB_CONFIG_FILE=/etc/pywb.yml
|
|
||||||
|
|
||||||
setuid {{user}}
|
|
||||||
|
|
||||||
console log
|
|
||||||
|
|
||||||
exec nice brozzler-wayback
|
|
@ -3,8 +3,9 @@
|
|||||||
apt_key: url=http://download.rethinkdb.com/apt/pubkey.gpg
|
apt_key: url=http://download.rethinkdb.com/apt/pubkey.gpg
|
||||||
become: true
|
become: true
|
||||||
- name: ensure rethinkdb repo is in apt sources.list
|
- name: ensure rethinkdb repo is in apt sources.list
|
||||||
apt_repository: repo='deb http://download.rethinkdb.com/apt trusty main'
|
apt_repository:
|
||||||
state=present
|
repo: 'deb http://download.rethinkdb.com/apt {{ansible_lsb.codename|lower}} main'
|
||||||
|
state: present
|
||||||
become: true
|
become: true
|
||||||
- apt: update_cache=yes
|
- apt: update_cache=yes
|
||||||
become: true
|
become: true
|
||||||
|
@ -1,4 +1,7 @@
|
|||||||
---
|
---
|
||||||
- name: restart warcprox
|
- name: restart warcprox
|
||||||
service: name=warcprox state=restarted
|
svc:
|
||||||
|
name: warcprox
|
||||||
|
state: restarted
|
||||||
|
service_dir: /etc/service
|
||||||
become: true
|
become: true
|
||||||
|
@ -4,26 +4,37 @@
|
|||||||
apt: name={{item}} state=present
|
apt: name={{item}} state=present
|
||||||
with_items:
|
with_items:
|
||||||
- gcc
|
- gcc
|
||||||
- python3.4
|
- python3-dev
|
||||||
- libpython3.4-dev
|
|
||||||
- libffi-dev
|
- libffi-dev
|
||||||
- libssl-dev
|
- libssl-dev
|
||||||
- tor
|
- tor
|
||||||
- git
|
- git
|
||||||
- name: mkdir {{venv_root}}/warcprox-ve34
|
- name: mkdir {{venv_root}}/warcprox-ve3
|
||||||
become: true
|
become: true
|
||||||
file: path={{venv_root}}/warcprox-ve34 state=directory owner={{user}}
|
file: path={{venv_root}}/warcprox-ve3 state=directory owner={{user}}
|
||||||
- name: install warcprox in virtualenv
|
- name: install warcprox in virtualenv
|
||||||
pip: name=git+https://github.com/internetarchive/warcprox.git#egg=warcprox
|
pip:
|
||||||
virtualenv={{venv_root}}/warcprox-ve34
|
name: git+https://github.com/internetarchive/warcprox.git#egg=warcprox
|
||||||
virtualenv_python=python3.4
|
virtualenv: '{{venv_root}}/warcprox-ve3'
|
||||||
extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
|
virtualenv_python: python3
|
||||||
|
extra_args: --no-input --upgrade --pre --cache-dir=/tmp/pip-cache
|
||||||
|
virtualenv_command: python3 /usr/lib/python3/dist-packages/virtualenv.py
|
||||||
become: true
|
become: true
|
||||||
become_user: '{{user}}'
|
become_user: '{{user}}'
|
||||||
notify:
|
notify:
|
||||||
- restart warcprox
|
- restart warcprox
|
||||||
- name: install upstart config /etc/init/warcprox.conf
|
|
||||||
|
- name: mkdir /etc/service/warcprox
|
||||||
|
file:
|
||||||
|
path: /etc/service/warcprox
|
||||||
|
state: directory
|
||||||
become: true
|
become: true
|
||||||
template: src=templates/warcprox.conf.j2 dest=/etc/init/warcprox.conf
|
|
||||||
|
- name: install /etc/service/warcprox/run
|
||||||
|
template:
|
||||||
|
src: templates/run.j2
|
||||||
|
dest: /etc/service/warcprox/run
|
||||||
|
mode: 0755
|
||||||
notify:
|
notify:
|
||||||
- restart warcprox
|
- restart warcprox
|
||||||
|
become: true
|
||||||
|
@ -1,19 +1,16 @@
|
|||||||
description "warcprox"
|
#!/bin/bash
|
||||||
|
|
||||||
start on runlevel [2345]
|
logfile=/var/log/warcprox.log
|
||||||
stop on runlevel [!2345]
|
touch $logfile
|
||||||
|
chown {{user}} $logfile
|
||||||
|
|
||||||
env PYTHONPATH={{venv_root}}/warcprox-ve34/lib/python3.4/site-packages
|
ulimit -n 4096
|
||||||
env PATH={{venv_root}}/warcprox-ve34/bin:/usr/bin:/bin
|
|
||||||
|
|
||||||
# by default warcprox creates some files/dirs relative to cwd
|
cd {{work_dir}}
|
||||||
chdir {{work_dir}}
|
|
||||||
setuid {{user}}
|
|
||||||
|
|
||||||
console log
|
source {{venv_root}}/warcprox-ve3/bin/activate
|
||||||
|
|
||||||
# --profile
|
exec nice -n5 setuidgid {{user}} env LANG=en_US.UTF-8 LC_COLLATE=C warcprox \
|
||||||
exec nice warcprox \
|
|
||||||
--address=0.0.0.0 \
|
--address=0.0.0.0 \
|
||||||
--dir={{warcs_dir}} \
|
--dir={{warcs_dir}} \
|
||||||
--base32 \
|
--base32 \
|
||||||
@ -22,4 +19,6 @@ exec nice warcprox \
|
|||||||
--onion-tor-socks-proxy=localhost:9050 \
|
--onion-tor-socks-proxy=localhost:9050 \
|
||||||
--rethinkdb-services-url=rethinkdb://{{groups['rethinkdb']|join(',')}}/brozzler/services \
|
--rethinkdb-services-url=rethinkdb://{{groups['rethinkdb']|join(',')}}/brozzler/services \
|
||||||
--rethinkdb-stats-url=rethinkdb://{{groups['rethinkdb']|join(',')}}/brozzler/stats \
|
--rethinkdb-stats-url=rethinkdb://{{groups['rethinkdb']|join(',')}}/brozzler/stats \
|
||||||
--rethinkdb-big-table-url=rethinkdb://{{groups['rethinkdb']|join(',')}}/brozzler/captures
|
--rethinkdb-big-table-url=rethinkdb://{{groups['rethinkdb']|join(',')}}/brozzler/captures \
|
||||||
|
>> $logfile 2>&1
|
||||||
|
|
@ -159,7 +159,7 @@ class ThreadExceptionGate:
|
|||||||
def queue_exception(self, e):
|
def queue_exception(self, e):
|
||||||
with self.lock:
|
with self.lock:
|
||||||
if self.pending_exception:
|
if self.pending_exception:
|
||||||
self.logger.warn(
|
self.logger.warning(
|
||||||
'%r already pending for thread %r, discarding %r',
|
'%r already pending for thread %r, discarding %r',
|
||||||
self.pending_exception, self.thread, e)
|
self.pending_exception, self.thread, e)
|
||||||
else:
|
else:
|
||||||
|
@ -223,7 +223,7 @@ class Chrome:
|
|||||||
raise
|
raise
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if time.time() - self._last_warning > 30:
|
if time.time() - self._last_warning > 30:
|
||||||
self.logger.warn(
|
self.logger.warning(
|
||||||
'problem with %s (will keep trying until timeout '
|
'problem with %s (will keep trying until timeout '
|
||||||
'of %d seconds): %s', json_url, timeout_sec, e)
|
'of %d seconds): %s', json_url, timeout_sec, e)
|
||||||
self._last_warning = time.time()
|
self._last_warning = time.time()
|
||||||
@ -294,7 +294,7 @@ class Chrome:
|
|||||||
'chrome pid %s exited normally',
|
'chrome pid %s exited normally',
|
||||||
self.chrome_process.pid)
|
self.chrome_process.pid)
|
||||||
else:
|
else:
|
||||||
self.logger.warn(
|
self.logger.warning(
|
||||||
'chrome pid %s exited with nonzero status %s',
|
'chrome pid %s exited with nonzero status %s',
|
||||||
self.chrome_process.pid, status)
|
self.chrome_process.pid, status)
|
||||||
|
|
||||||
@ -305,13 +305,13 @@ class Chrome:
|
|||||||
return
|
return
|
||||||
time.sleep(0.5)
|
time.sleep(0.5)
|
||||||
|
|
||||||
self.logger.warn(
|
self.logger.warning(
|
||||||
'chrome pid %s still alive %.1f seconds after sending '
|
'chrome pid %s still alive %.1f seconds after sending '
|
||||||
'SIGTERM, sending SIGKILL', self.chrome_process.pid,
|
'SIGTERM, sending SIGKILL', self.chrome_process.pid,
|
||||||
time.time() - t0)
|
time.time() - t0)
|
||||||
os.killpg(self.chrome_process.pid, signal.SIGKILL)
|
os.killpg(self.chrome_process.pid, signal.SIGKILL)
|
||||||
status = self.chrome_process.wait()
|
status = self.chrome_process.wait()
|
||||||
self.logger.warn(
|
self.logger.warning(
|
||||||
'chrome pid %s reaped (status=%s) after killing with '
|
'chrome pid %s reaped (status=%s) after killing with '
|
||||||
'SIGKILL', self.chrome_process.pid, status)
|
'SIGKILL', self.chrome_process.pid, status)
|
||||||
|
|
||||||
|
@ -627,7 +627,7 @@ def brozzler_purge(argv=None):
|
|||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
if job.status == 'ACTIVE':
|
if job.status == 'ACTIVE':
|
||||||
if args.force:
|
if args.force:
|
||||||
logging.warn(
|
logging.warning(
|
||||||
'job %s has status ACTIVE, purging anyway because '
|
'job %s has status ACTIVE, purging anyway because '
|
||||||
'--force was supplied', job_id)
|
'--force was supplied', job_id)
|
||||||
else:
|
else:
|
||||||
@ -644,7 +644,7 @@ def brozzler_purge(argv=None):
|
|||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
if site.status == 'ACTIVE':
|
if site.status == 'ACTIVE':
|
||||||
if args.force:
|
if args.force:
|
||||||
logging.warn(
|
logging.warning(
|
||||||
'site %s has status ACTIVE, purging anyway because '
|
'site %s has status ACTIVE, purging anyway because '
|
||||||
'--force was supplied', site_id)
|
'--force was supplied', site_id)
|
||||||
else:
|
else:
|
||||||
@ -712,7 +712,7 @@ def brozzler_list_captures(argv=None):
|
|||||||
|
|
||||||
if args.url_or_sha1[:5] == 'sha1:':
|
if args.url_or_sha1[:5] == 'sha1:':
|
||||||
if args.prefix:
|
if args.prefix:
|
||||||
logging.warn(
|
logging.warning(
|
||||||
'ignoring supplied --prefix option which does not apply '
|
'ignoring supplied --prefix option which does not apply '
|
||||||
'to lookup by sha1')
|
'to lookup by sha1')
|
||||||
# assumes it's already base32 (XXX could detect if hex and convert)
|
# assumes it's already base32 (XXX could detect if hex and convert)
|
||||||
|
@ -260,7 +260,7 @@ class BrozzlerEasyController:
|
|||||||
state_strs.append(str(th))
|
state_strs.append(str(th))
|
||||||
stack = traceback.format_stack(sys._current_frames()[th.ident])
|
stack = traceback.format_stack(sys._current_frames()[th.ident])
|
||||||
state_strs.append(''.join(stack))
|
state_strs.append(''.join(stack))
|
||||||
logging.warn('dumping state (caught signal {})\n{}'.format(
|
logging.warning('dumping state (caught signal {})\n{}'.format(
|
||||||
signum, '\n'.join(state_strs)))
|
signum, '\n'.join(state_strs)))
|
||||||
|
|
||||||
def main(argv=None):
|
def main(argv=None):
|
||||||
|
@ -138,7 +138,7 @@ class RethinkDbFrontier:
|
|||||||
sites = []
|
sites = []
|
||||||
for i in range(result["replaced"]):
|
for i in range(result["replaced"]):
|
||||||
if result["changes"][i]["old_val"]["claimed"]:
|
if result["changes"][i]["old_val"]["claimed"]:
|
||||||
self.logger.warn(
|
self.logger.warning(
|
||||||
"re-claimed site that was still marked 'claimed' "
|
"re-claimed site that was still marked 'claimed' "
|
||||||
"because it was last claimed a long time ago "
|
"because it was last claimed a long time ago "
|
||||||
"at %s, and presumably some error stopped it from "
|
"at %s, and presumably some error stopped it from "
|
||||||
@ -225,7 +225,7 @@ class RethinkDbFrontier:
|
|||||||
if not job:
|
if not job:
|
||||||
return False
|
return False
|
||||||
if job.status.startswith("FINISH"):
|
if job.status.startswith("FINISH"):
|
||||||
self.logger.warn("%s is already %s", job, job.status)
|
self.logger.warning("%s is already %s", job, job.status)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
results = self.rr.table("sites").get_all(job_id, index="job_id").run()
|
results = self.rr.table("sites").get_all(job_id, index="job_id").run()
|
||||||
@ -415,7 +415,7 @@ class RethinkDbFrontier:
|
|||||||
assert isinstance(e, brozzler.ReachedLimit)
|
assert isinstance(e, brozzler.ReachedLimit)
|
||||||
if (site.reached_limit
|
if (site.reached_limit
|
||||||
and site.reached_limit != e.warcprox_meta["reached-limit"]):
|
and site.reached_limit != e.warcprox_meta["reached-limit"]):
|
||||||
self.logger.warn(
|
self.logger.warning(
|
||||||
"reached limit %s but site had already reached limit %s",
|
"reached limit %s but site had already reached limit %s",
|
||||||
e.warcprox_meta["reached-limit"], self.reached_limit)
|
e.warcprox_meta["reached-limit"], self.reached_limit)
|
||||||
else:
|
else:
|
||||||
@ -434,7 +434,7 @@ class RethinkDbFrontier:
|
|||||||
index="priority_by_site").filter({"hops_from_seed":0}).run()
|
index="priority_by_site").filter({"hops_from_seed":0}).run()
|
||||||
pages = list(results)
|
pages = list(results)
|
||||||
if len(pages) > 1:
|
if len(pages) > 1:
|
||||||
self.logger.warn(
|
self.logger.warning(
|
||||||
"more than one seed page for site_id %s ?", site_id)
|
"more than one seed page for site_id %s ?", site_id)
|
||||||
if len(pages) < 1:
|
if len(pages) < 1:
|
||||||
return None
|
return None
|
||||||
|
@ -106,7 +106,7 @@ def is_permitted_by_robots(site, url, proxy=None):
|
|||||||
# reppy has wrapped an exception that we want to bubble up
|
# reppy has wrapped an exception that we want to bubble up
|
||||||
raise brozzler.ProxyError(e)
|
raise brozzler.ProxyError(e)
|
||||||
else:
|
else:
|
||||||
logging.warn(
|
logging.warning(
|
||||||
"returning true (permitted) after problem fetching "
|
"returning true (permitted) after problem fetching "
|
||||||
"robots.txt for %r: %r", url, e)
|
"robots.txt for %r: %r", url, e)
|
||||||
return True
|
return True
|
||||||
|
@ -147,13 +147,13 @@ class BrozzlerWorker:
|
|||||||
try:
|
try:
|
||||||
with urllib.request.urlopen(request, timeout=600) as response:
|
with urllib.request.urlopen(request, timeout=600) as response:
|
||||||
if response.getcode() != 204:
|
if response.getcode() != 204:
|
||||||
self.logger.warn(
|
self.logger.warning(
|
||||||
'got "%s %s" response on warcprox '
|
'got "%s %s" response on warcprox '
|
||||||
'WARCPROX_WRITE_RECORD request (expected 204)',
|
'WARCPROX_WRITE_RECORD request (expected 204)',
|
||||||
response.getcode(), response.reason)
|
response.getcode(), response.reason)
|
||||||
return request, response
|
return request, response
|
||||||
except urllib.error.HTTPError as e:
|
except urllib.error.HTTPError as e:
|
||||||
self.logger.warn(
|
self.logger.warning(
|
||||||
'got "%s %s" response on warcprox '
|
'got "%s %s" response on warcprox '
|
||||||
'WARCPROX_WRITE_RECORD request (expected 204)',
|
'WARCPROX_WRITE_RECORD request (expected 204)',
|
||||||
e.getcode(), e.info())
|
e.getcode(), e.info())
|
||||||
@ -370,7 +370,7 @@ class BrozzlerWorker:
|
|||||||
if (page.needs_robots_check and
|
if (page.needs_robots_check and
|
||||||
not brozzler.is_permitted_by_robots(
|
not brozzler.is_permitted_by_robots(
|
||||||
site, page.url, self._proxy_for(site))):
|
site, page.url, self._proxy_for(site))):
|
||||||
logging.warn("page %s is blocked by robots.txt", page.url)
|
logging.warning("page %s is blocked by robots.txt", page.url)
|
||||||
page.blocked_by_robots = True
|
page.blocked_by_robots = True
|
||||||
self._frontier.completed_page(site, page)
|
self._frontier.completed_page(site, page)
|
||||||
else:
|
else:
|
||||||
@ -544,7 +544,7 @@ class BrozzlerWorker:
|
|||||||
def start(self):
|
def start(self):
|
||||||
with self._start_stop_lock:
|
with self._start_stop_lock:
|
||||||
if self._thread:
|
if self._thread:
|
||||||
self.logger.warn(
|
self.logger.warning(
|
||||||
'ignoring start request because self._thread is '
|
'ignoring start request because self._thread is '
|
||||||
'not None')
|
'not None')
|
||||||
return
|
return
|
||||||
|
@ -48,7 +48,7 @@ _orig_webpage_read_content = youtube_dl.extractor.generic.GenericIE._webpage_rea
|
|||||||
def _webpage_read_content(self, *args, **kwargs):
|
def _webpage_read_content(self, *args, **kwargs):
|
||||||
content = _orig_webpage_read_content(self, *args, **kwargs)
|
content = _orig_webpage_read_content(self, *args, **kwargs)
|
||||||
if len(content) > 20000000:
|
if len(content) > 20000000:
|
||||||
logging.warn(
|
logging.warning(
|
||||||
'bypassing youtube-dl extraction because content is '
|
'bypassing youtube-dl extraction because content is '
|
||||||
'too large (%s characters)', len(content))
|
'too large (%s characters)', len(content))
|
||||||
return ''
|
return ''
|
||||||
@ -185,7 +185,7 @@ def _build_youtube_dl(worker, destdir, site):
|
|||||||
mimetype = magic.from_file(ctx['filename'], mime=True)
|
mimetype = magic.from_file(ctx['filename'], mime=True)
|
||||||
except ImportError as e:
|
except ImportError as e:
|
||||||
mimetype = 'video/%s' % info_dict['ext']
|
mimetype = 'video/%s' % info_dict['ext']
|
||||||
self.logger.warn(
|
self.logger.warning(
|
||||||
'guessing mimetype %s because %r', mimetype, e)
|
'guessing mimetype %s because %r', mimetype, e)
|
||||||
|
|
||||||
url = 'youtube-dl:%05d:%s' % (
|
url = 'youtube-dl:%05d:%s' % (
|
||||||
|
@ -34,16 +34,41 @@ import http.server
|
|||||||
import logging
|
import logging
|
||||||
import warcprox
|
import warcprox
|
||||||
|
|
||||||
|
# https://stackoverflow.com/questions/166506/finding-local-ip-addresses-using-pythons-stdlib
|
||||||
|
def _local_address():
|
||||||
|
import socket
|
||||||
|
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
|
||||||
|
try:
|
||||||
|
s.connect(('10.255.255.255', 1)) # ip doesn't need to be reachable
|
||||||
|
return s.getsockname()[0]
|
||||||
|
except:
|
||||||
|
return '127.0.0.1'
|
||||||
|
finally:
|
||||||
|
s.close()
|
||||||
|
|
||||||
|
local_address = _local_address()
|
||||||
|
|
||||||
def start_service(service):
|
def start_service(service):
|
||||||
subprocess.check_call(['sudo', 'service', service, 'start'])
|
subprocess.check_call(['sudo', 'svc', '-u', '/etc/service/' + service])
|
||||||
|
|
||||||
def stop_service(service):
|
def stop_service(service):
|
||||||
subprocess.check_call(['sudo', 'service', service, 'stop'])
|
subprocess.check_call(['sudo', 'svc', '-d', '/etc/service/' + service])
|
||||||
|
while True:
|
||||||
|
status = subprocess.check_output(
|
||||||
|
['sudo', 'svstat', '/etc/service/' + service])
|
||||||
|
if b' down ' in status:
|
||||||
|
break
|
||||||
|
time.sleep(0.5)
|
||||||
|
|
||||||
@pytest.fixture(scope='module')
|
@pytest.fixture(scope='module')
|
||||||
def httpd(request):
|
def httpd(request):
|
||||||
class RequestHandler(http.server.SimpleHTTPRequestHandler):
|
class RequestHandler(http.server.SimpleHTTPRequestHandler):
|
||||||
|
def do_POST(self):
|
||||||
|
logging.info('\n%s\n%s', self.requestline, self.headers)
|
||||||
|
self.do_GET()
|
||||||
|
|
||||||
def do_GET(self):
|
def do_GET(self):
|
||||||
|
logging.info('\n%s\n%s', self.requestline, self.headers)
|
||||||
if self.path == '/site5/redirect/':
|
if self.path == '/site5/redirect/':
|
||||||
self.send_response(303, 'See other')
|
self.send_response(303, 'See other')
|
||||||
self.send_header('Connection', 'close')
|
self.send_header('Connection', 'close')
|
||||||
@ -82,7 +107,7 @@ def httpd(request):
|
|||||||
# SimpleHTTPRequestHandler always uses CWD so we have to chdir
|
# SimpleHTTPRequestHandler always uses CWD so we have to chdir
|
||||||
os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs'))
|
os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs'))
|
||||||
|
|
||||||
httpd = http.server.HTTPServer(('localhost', 0), RequestHandler)
|
httpd = http.server.HTTPServer((local_address, 0), RequestHandler)
|
||||||
httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
|
httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
|
||||||
httpd_thread.start()
|
httpd_thread.start()
|
||||||
|
|
||||||
@ -94,6 +119,9 @@ def httpd(request):
|
|||||||
|
|
||||||
return httpd
|
return httpd
|
||||||
|
|
||||||
|
def make_url(httpd, rel_url):
|
||||||
|
return 'http://%s:%s%s' % (local_address, httpd.server_port, rel_url)
|
||||||
|
|
||||||
def test_httpd(httpd):
|
def test_httpd(httpd):
|
||||||
'''
|
'''
|
||||||
Tests that our http server is working as expected, and that two fetches
|
Tests that our http server is working as expected, and that two fetches
|
||||||
@ -101,7 +129,7 @@ def test_httpd(httpd):
|
|||||||
deduplication.
|
deduplication.
|
||||||
'''
|
'''
|
||||||
payload1 = content2 = None
|
payload1 = content2 = None
|
||||||
url = 'http://localhost:%s/site1/file1.txt' % httpd.server_port
|
url = make_url(httpd, '/site1/file1.txt')
|
||||||
with urllib.request.urlopen(url) as response:
|
with urllib.request.urlopen(url) as response:
|
||||||
assert response.status == 200
|
assert response.status == 200
|
||||||
payload1 = response.read()
|
payload1 = response.read()
|
||||||
@ -140,13 +168,13 @@ def test_brozzle_site(httpd):
|
|||||||
test_id = 'test_brozzle_site-%s' % datetime.datetime.utcnow().isoformat()
|
test_id = 'test_brozzle_site-%s' % datetime.datetime.utcnow().isoformat()
|
||||||
rr = doublethink.Rethinker('localhost', db='brozzler')
|
rr = doublethink.Rethinker('localhost', db='brozzler')
|
||||||
site = brozzler.Site(rr, {
|
site = brozzler.Site(rr, {
|
||||||
'seed': 'http://localhost:%s/site1/' % httpd.server_port,
|
'seed': make_url(httpd, '/site1/'),
|
||||||
'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
|
'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
|
||||||
|
|
||||||
# the two pages we expect to be crawled
|
# the two pages we expect to be crawled
|
||||||
page1 = 'http://localhost:%s/site1/' % httpd.server_port
|
page1 = make_url(httpd, '/site1/')
|
||||||
page2 = 'http://localhost:%s/site1/file1.txt' % httpd.server_port
|
page2 = make_url(httpd, '/site1/file1.txt')
|
||||||
robots = 'http://localhost:%s/robots.txt' % httpd.server_port
|
robots = make_url(httpd, '/robots.txt')
|
||||||
|
|
||||||
# so we can examine rethinkdb before it does anything
|
# so we can examine rethinkdb before it does anything
|
||||||
try:
|
try:
|
||||||
@ -171,8 +199,7 @@ def test_brozzle_site(httpd):
|
|||||||
pages = list(frontier.site_pages(site.id))
|
pages = list(frontier.site_pages(site.id))
|
||||||
assert len(pages) == 2
|
assert len(pages) == 2
|
||||||
assert {page.url for page in pages} == {
|
assert {page.url for page in pages} == {
|
||||||
'http://localhost:%s/site1/' % httpd.server_port,
|
make_url(httpd, '/site1/'), make_url(httpd, '/site1/file1.txt')}
|
||||||
'http://localhost:%s/site1/file1.txt' % httpd.server_port}
|
|
||||||
|
|
||||||
time.sleep(2) # in case warcprox hasn't finished processing urls
|
time.sleep(2) # in case warcprox hasn't finished processing urls
|
||||||
# take a look at the captures table
|
# take a look at the captures table
|
||||||
@ -255,8 +282,8 @@ def test_proxy_non_warcprox(httpd):
|
|||||||
start_service('brozzler-worker')
|
start_service('brozzler-worker')
|
||||||
assert len(proxy.requests) <= 15
|
assert len(proxy.requests) <= 15
|
||||||
assert proxy.requests.count('GET /status') == 1
|
assert proxy.requests.count('GET /status') == 1
|
||||||
assert ('GET http://localhost:%s/site1/' % httpd.server_port) in proxy.requests
|
assert ('GET %s' % make_url(httpd, '/site1/')) in proxy.requests
|
||||||
assert ('GET http://localhost:%s/site1/file1.txt' % httpd.server_port) in proxy.requests
|
assert ('GET %s' % make_url(httpd, '/site1/file1.txt')) in proxy.requests
|
||||||
assert [req for req in proxy.requests if req.startswith('WARCPROX_WRITE_RECORD')] == []
|
assert [req for req in proxy.requests if req.startswith('WARCPROX_WRITE_RECORD')] == []
|
||||||
|
|
||||||
proxy.shutdown()
|
proxy.shutdown()
|
||||||
@ -292,14 +319,14 @@ def _test_proxy_setting(
|
|||||||
datetime.datetime.utcnow().isoformat())
|
datetime.datetime.utcnow().isoformat())
|
||||||
|
|
||||||
# the two pages we expect to be crawled
|
# the two pages we expect to be crawled
|
||||||
page1 = 'http://localhost:%s/site1/' % httpd.server_port
|
page1 = make_url(httpd, '/site1/')
|
||||||
page2 = 'http://localhost:%s/site1/file1.txt' % httpd.server_port
|
page2 = make_url(httpd, '/site1/file1.txt')
|
||||||
robots = 'http://localhost:%s/robots.txt' % httpd.server_port
|
robots = make_url(httpd, '/robots.txt')
|
||||||
|
|
||||||
rr = doublethink.Rethinker('localhost', db='brozzler')
|
rr = doublethink.Rethinker('localhost', db='brozzler')
|
||||||
service_registry = doublethink.ServiceRegistry(rr)
|
service_registry = doublethink.ServiceRegistry(rr)
|
||||||
site = brozzler.Site(rr, {
|
site = brozzler.Site(rr, {
|
||||||
'seed': 'http://localhost:%s/site1/' % httpd.server_port,
|
'seed': make_url(httpd, '/site1/'),
|
||||||
'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
|
'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
|
||||||
assert site.id is None
|
assert site.id is None
|
||||||
frontier = brozzler.RethinkDbFrontier(rr)
|
frontier = brozzler.RethinkDbFrontier(rr)
|
||||||
@ -332,8 +359,8 @@ def _test_proxy_setting(
|
|||||||
pages = list(frontier.site_pages(site.id))
|
pages = list(frontier.site_pages(site.id))
|
||||||
assert len(pages) == 2
|
assert len(pages) == 2
|
||||||
assert {page.url for page in pages} == {
|
assert {page.url for page in pages} == {
|
||||||
'http://localhost:%s/site1/' % httpd.server_port,
|
make_url(httpd, '/site1/'),
|
||||||
'http://localhost:%s/site1/file1.txt' % httpd.server_port}
|
make_url(httpd, '/site1/file1.txt')}
|
||||||
|
|
||||||
time.sleep(2) # in case warcprox hasn't finished processing urls
|
time.sleep(2) # in case warcprox hasn't finished processing urls
|
||||||
# take a look at the captures table
|
# take a look at the captures table
|
||||||
@ -360,7 +387,7 @@ def test_obey_robots(httpd):
|
|||||||
test_id = 'test_obey_robots-%s' % datetime.datetime.utcnow().isoformat()
|
test_id = 'test_obey_robots-%s' % datetime.datetime.utcnow().isoformat()
|
||||||
rr = doublethink.Rethinker('localhost', db='brozzler')
|
rr = doublethink.Rethinker('localhost', db='brozzler')
|
||||||
site = brozzler.Site(rr, {
|
site = brozzler.Site(rr, {
|
||||||
'seed': 'http://localhost:%s/site1/' % httpd.server_port,
|
'seed': make_url(httpd, '/site1/'),
|
||||||
'user_agent': 'im a badbot', # robots.txt blocks badbot
|
'user_agent': 'im a badbot', # robots.txt blocks badbot
|
||||||
'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
|
'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
|
||||||
|
|
||||||
@ -390,12 +417,12 @@ def test_obey_robots(httpd):
|
|||||||
pages = list(frontier.site_pages(site.id))
|
pages = list(frontier.site_pages(site.id))
|
||||||
assert len(pages) == 1
|
assert len(pages) == 1
|
||||||
page = pages[0]
|
page = pages[0]
|
||||||
assert page.url == 'http://localhost:%s/site1/' % httpd.server_port
|
assert page.url == make_url(httpd, '/site1/')
|
||||||
assert page.blocked_by_robots
|
assert page.blocked_by_robots
|
||||||
|
|
||||||
# take a look at the captures table
|
# take a look at the captures table
|
||||||
time.sleep(2) # in case warcprox hasn't finished processing urls
|
time.sleep(2) # in case warcprox hasn't finished processing urls
|
||||||
robots_url = 'http://localhost:%s/robots.txt' % httpd.server_port
|
robots_url = make_url(httpd, '/robots.txt')
|
||||||
captures = list(rr.table('captures').filter({'test_id':test_id}).run())
|
captures = list(rr.table('captures').filter({'test_id':test_id}).run())
|
||||||
assert len(captures) == 1
|
assert len(captures) == 1
|
||||||
assert captures[0]['url'] == robots_url
|
assert captures[0]['url'] == robots_url
|
||||||
@ -412,7 +439,7 @@ def test_login(httpd):
|
|||||||
test_id = 'test_login-%s' % datetime.datetime.utcnow().isoformat()
|
test_id = 'test_login-%s' % datetime.datetime.utcnow().isoformat()
|
||||||
rr = doublethink.Rethinker('localhost', db='brozzler')
|
rr = doublethink.Rethinker('localhost', db='brozzler')
|
||||||
site = brozzler.Site(rr, {
|
site = brozzler.Site(rr, {
|
||||||
'seed': 'http://localhost:%s/site2/' % httpd.server_port,
|
'seed': make_url(httpd, '/site2/'),
|
||||||
'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}},
|
'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}},
|
||||||
'username': 'test_username', 'password': 'test_password'})
|
'username': 'test_username', 'password': 'test_password'})
|
||||||
|
|
||||||
@ -428,7 +455,7 @@ def test_login(httpd):
|
|||||||
|
|
||||||
# take a look at the captures table
|
# take a look at the captures table
|
||||||
time.sleep(2) # in case warcprox hasn't finished processing urls
|
time.sleep(2) # in case warcprox hasn't finished processing urls
|
||||||
robots_url = 'http://localhost:%s/robots.txt' % httpd.server_port
|
robots_url = make_url(httpd, '/robots.txt')
|
||||||
captures = list(rr.table('captures').filter(
|
captures = list(rr.table('captures').filter(
|
||||||
{'test_id':test_id}).order_by('timestamp').run())
|
{'test_id':test_id}).order_by('timestamp').run())
|
||||||
meth_url = ['%s %s' % (c['http_method'], c['url']) for c in captures]
|
meth_url = ['%s %s' % (c['http_method'], c['url']) for c in captures]
|
||||||
@ -436,25 +463,25 @@ def test_login(httpd):
|
|||||||
# there are several forms in in htdocs/site2/login.html but only one
|
# there are several forms in in htdocs/site2/login.html but only one
|
||||||
# that brozzler's heuristic should match and try to submit, and it has
|
# that brozzler's heuristic should match and try to submit, and it has
|
||||||
# action='00', so we can check for that here
|
# action='00', so we can check for that here
|
||||||
assert ('POST http://localhost:%s/site2/00' % httpd.server_port) in meth_url
|
assert ('POST %s' % make_url(httpd, '/site2/00')) in meth_url
|
||||||
|
|
||||||
# sanity check the rest of the crawl
|
# sanity check the rest of the crawl
|
||||||
assert ('GET http://localhost:%s/robots.txt' % httpd.server_port) in meth_url
|
assert ('GET %s' % make_url(httpd, '/robots.txt')) in meth_url
|
||||||
assert ('GET http://localhost:%s/site2/' % httpd.server_port) in meth_url
|
assert ('GET %s' % make_url(httpd, '/site2/')) in meth_url
|
||||||
assert ('WARCPROX_WRITE_RECORD screenshot:http://localhost:%s/site2/' % httpd.server_port) in meth_url
|
assert ('WARCPROX_WRITE_RECORD screenshot:%s' % make_url(httpd, '/site2/')) in meth_url
|
||||||
assert ('WARCPROX_WRITE_RECORD thumbnail:http://localhost:%s/site2/' % httpd.server_port) in meth_url
|
assert ('WARCPROX_WRITE_RECORD thumbnail:%s' % make_url(httpd, '/site2/')) in meth_url
|
||||||
assert ('GET http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url
|
assert ('GET %s' % make_url(httpd, '/site2/login.html')) in meth_url
|
||||||
assert ('WARCPROX_WRITE_RECORD screenshot:http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url
|
assert ('WARCPROX_WRITE_RECORD screenshot:%s' % make_url(httpd, '/site2/login.html')) in meth_url
|
||||||
assert ('WARCPROX_WRITE_RECORD thumbnail:http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url
|
assert ('WARCPROX_WRITE_RECORD thumbnail:%s' % make_url(httpd, '/site2/login.html')) in meth_url
|
||||||
|
|
||||||
def test_seed_redirect(httpd):
|
def test_seed_redirect(httpd):
|
||||||
test_id = 'test_seed_redirect-%s' % datetime.datetime.utcnow().isoformat()
|
test_id = 'test_seed_redirect-%s' % datetime.datetime.utcnow().isoformat()
|
||||||
rr = doublethink.Rethinker('localhost', db='brozzler')
|
rr = doublethink.Rethinker('localhost', db='brozzler')
|
||||||
seed_url = 'http://localhost:%s/site5/redirect/' % httpd.server_port
|
seed_url = make_url(httpd, '/site5/redirect/')
|
||||||
site = brozzler.Site(rr, {
|
site = brozzler.Site(rr, {
|
||||||
'seed': 'http://localhost:%s/site5/redirect/' % httpd.server_port,
|
'seed': make_url(httpd, '/site5/redirect/'),
|
||||||
'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
|
'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
|
||||||
assert site.scope == {'accepts': [{'ssurt': 'localhost,//%s:http:/site5/redirect/' % httpd.server_port}]}
|
assert site.scope == {'accepts': [{'ssurt': '%s//%s:http:/site5/redirect/' % (local_address, httpd.server_port)}]}
|
||||||
|
|
||||||
frontier = brozzler.RethinkDbFrontier(rr)
|
frontier = brozzler.RethinkDbFrontier(rr)
|
||||||
brozzler.new_site(frontier, site)
|
brozzler.new_site(frontier, site)
|
||||||
@ -473,19 +500,19 @@ def test_seed_redirect(httpd):
|
|||||||
pages.sort(key=lambda page: page.hops_from_seed)
|
pages.sort(key=lambda page: page.hops_from_seed)
|
||||||
assert pages[0].hops_from_seed == 0
|
assert pages[0].hops_from_seed == 0
|
||||||
assert pages[0].url == seed_url
|
assert pages[0].url == seed_url
|
||||||
assert pages[0].redirect_url == 'http://localhost:%s/site5/destination/' % httpd.server_port
|
assert pages[0].redirect_url == make_url(httpd, '/site5/destination/')
|
||||||
assert pages[1].hops_from_seed == 1
|
assert pages[1].hops_from_seed == 1
|
||||||
assert pages[1].url == 'http://localhost:%s/site5/destination/page2.html' % httpd.server_port
|
assert pages[1].url == make_url(httpd, '/site5/destination/page2.html')
|
||||||
|
|
||||||
# check that scope has been updated properly
|
# check that scope has been updated properly
|
||||||
assert site.scope == {'accepts': [
|
assert site.scope == {'accepts': [
|
||||||
{'ssurt': 'localhost,//%s:http:/site5/redirect/' % httpd.server_port},
|
{'ssurt': '%s//%s:http:/site5/redirect/' % (local_address, httpd.server_port)},
|
||||||
{'ssurt': 'localhost,//%s:http:/site5/destination/' % httpd.server_port}]}
|
{'ssurt': '%s//%s:http:/site5/destination/' % (local_address, httpd.server_port)}]}
|
||||||
|
|
||||||
def test_hashtags(httpd):
|
def test_hashtags(httpd):
|
||||||
test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat()
|
test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat()
|
||||||
rr = doublethink.Rethinker('localhost', db='brozzler')
|
rr = doublethink.Rethinker('localhost', db='brozzler')
|
||||||
seed_url = 'http://localhost:%s/site7/' % httpd.server_port
|
seed_url = make_url(httpd, '/site7/')
|
||||||
site = brozzler.Site(rr, {
|
site = brozzler.Site(rr, {
|
||||||
'seed': seed_url,
|
'seed': seed_url,
|
||||||
'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
|
'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
|
||||||
@ -507,9 +534,9 @@ def test_hashtags(httpd):
|
|||||||
assert pages[0].url == seed_url
|
assert pages[0].url == seed_url
|
||||||
assert pages[0].hops_from_seed == 0
|
assert pages[0].hops_from_seed == 0
|
||||||
assert pages[0].brozzle_count == 1
|
assert pages[0].brozzle_count == 1
|
||||||
assert pages[0].outlinks['accepted'] == ['http://localhost:%s/site7/foo.html' % httpd.server_port]
|
assert pages[0].outlinks['accepted'] == [make_url(httpd, '/site7/foo.html')]
|
||||||
assert not pages[0].hashtags
|
assert not pages[0].hashtags
|
||||||
assert pages[1].url == 'http://localhost:%s/site7/foo.html' % httpd.server_port
|
assert pages[1].url == make_url(httpd, '/site7/foo.html')
|
||||||
assert pages[1].hops_from_seed == 1
|
assert pages[1].hops_from_seed == 1
|
||||||
assert pages[1].brozzle_count == 1
|
assert pages[1].brozzle_count == 1
|
||||||
assert sorted(pages[1].hashtags) == ['#boosh','#ignored','#whee',]
|
assert sorted(pages[1].hashtags) == ['#boosh','#ignored','#whee',]
|
||||||
@ -520,18 +547,18 @@ def test_hashtags(httpd):
|
|||||||
captures_by_url = {
|
captures_by_url = {
|
||||||
c['url']: c for c in captures if c['http_method'] != 'HEAD'}
|
c['url']: c for c in captures if c['http_method'] != 'HEAD'}
|
||||||
assert seed_url in captures_by_url
|
assert seed_url in captures_by_url
|
||||||
assert 'http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
|
assert make_url(httpd, '/site7/foo.html') in captures_by_url
|
||||||
assert 'http://localhost:%s/site7/whee.txt' % httpd.server_port in captures_by_url
|
assert make_url(httpd, '/site7/whee.txt') in captures_by_url
|
||||||
assert 'http://localhost:%s/site7/boosh.txt' % httpd.server_port in captures_by_url
|
assert make_url(httpd, '/site7/boosh.txt') in captures_by_url
|
||||||
assert 'screenshot:%s' % seed_url in captures_by_url
|
assert 'screenshot:%s' % seed_url in captures_by_url
|
||||||
assert 'thumbnail:%s' % seed_url in captures_by_url
|
assert 'thumbnail:%s' % seed_url in captures_by_url
|
||||||
assert 'screenshot:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
|
assert 'screenshot:%s' % make_url(httpd, '/site7/foo.html') in captures_by_url
|
||||||
assert 'thumbnail:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
|
assert 'thumbnail:%s' % make_url(httpd, '/site7/foo.html') in captures_by_url
|
||||||
|
|
||||||
def test_redirect_hashtags(httpd):
|
def test_redirect_hashtags(httpd):
|
||||||
test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat()
|
test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat()
|
||||||
rr = doublethink.Rethinker('localhost', db='brozzler')
|
rr = doublethink.Rethinker('localhost', db='brozzler')
|
||||||
seed_url = 'http://localhost:%s/site9/' % httpd.server_port
|
seed_url = make_url(httpd, '/site9/')
|
||||||
site = brozzler.Site(rr, {
|
site = brozzler.Site(rr, {
|
||||||
'seed': seed_url,
|
'seed': seed_url,
|
||||||
'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
|
'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
|
||||||
@ -553,9 +580,9 @@ def test_redirect_hashtags(httpd):
|
|||||||
assert pages[0].url == seed_url
|
assert pages[0].url == seed_url
|
||||||
assert pages[0].hops_from_seed == 0
|
assert pages[0].hops_from_seed == 0
|
||||||
assert pages[0].brozzle_count == 1
|
assert pages[0].brozzle_count == 1
|
||||||
assert pages[0].outlinks['accepted'] == ['http://localhost:%s/site9/redirect.html' % httpd.server_port]
|
assert pages[0].outlinks['accepted'] == [make_url(httpd, '/site9/redirect.html')]
|
||||||
assert not pages[0].hashtags
|
assert not pages[0].hashtags
|
||||||
assert pages[1].url == 'http://localhost:%s/site9/redirect.html' % httpd.server_port
|
assert pages[1].url == make_url(httpd, '/site9/redirect.html')
|
||||||
assert pages[1].hops_from_seed == 1
|
assert pages[1].hops_from_seed == 1
|
||||||
assert pages[1].brozzle_count == 1
|
assert pages[1].brozzle_count == 1
|
||||||
assert sorted(pages[1].hashtags) == ['#hash1','#hash2',]
|
assert sorted(pages[1].hashtags) == ['#hash1','#hash2',]
|
||||||
@ -563,7 +590,7 @@ def test_redirect_hashtags(httpd):
|
|||||||
time.sleep(2) # in case warcprox hasn't finished processing urls
|
time.sleep(2) # in case warcprox hasn't finished processing urls
|
||||||
# take a look at the captures table
|
# take a look at the captures table
|
||||||
captures = rr.table('captures').filter({'test_id':test_id}).run()
|
captures = rr.table('captures').filter({'test_id':test_id}).run()
|
||||||
redirect_captures = [c for c in captures if c['url'] == 'http://localhost:%s/site9/redirect.html' % httpd.server_port and c['http_method'] == 'GET']
|
redirect_captures = [c for c in captures if c['url'] == make_url(httpd, '/site9/redirect.html') and c['http_method'] == 'GET']
|
||||||
assert len(redirect_captures) == 2 # youtube-dl + browser, no hashtags
|
assert len(redirect_captures) == 2 # youtube-dl + browser, no hashtags
|
||||||
|
|
||||||
# === expected captures ===
|
# === expected captures ===
|
||||||
@ -589,9 +616,9 @@ def test_stop_crawl(httpd):
|
|||||||
|
|
||||||
# create a new job with three sites that could be crawled forever
|
# create a new job with three sites that could be crawled forever
|
||||||
job_conf = {'seeds': [
|
job_conf = {'seeds': [
|
||||||
{'url': 'http://localhost:%s/infinite/foo/' % httpd.server_port},
|
{'url': make_url(httpd, '/infinite/foo/')},
|
||||||
{'url': 'http://localhost:%s/infinite/bar/' % httpd.server_port},
|
{'url': make_url(httpd, '/infinite/bar/')},
|
||||||
{'url': 'http://localhost:%s/infinite/baz/' % httpd.server_port}]}
|
{'url': make_url(httpd, '/infinite/baz/')}]}
|
||||||
job = brozzler.new_job(frontier, job_conf)
|
job = brozzler.new_job(frontier, job_conf)
|
||||||
assert job.id
|
assert job.id
|
||||||
|
|
||||||
@ -675,7 +702,7 @@ def test_warcprox_outage_resiliency(httpd):
|
|||||||
# put together a site to crawl
|
# put together a site to crawl
|
||||||
test_id = 'test_warcprox_death-%s' % datetime.datetime.utcnow().isoformat()
|
test_id = 'test_warcprox_death-%s' % datetime.datetime.utcnow().isoformat()
|
||||||
site = brozzler.Site(rr, {
|
site = brozzler.Site(rr, {
|
||||||
'seed': 'http://localhost:%s/infinite/' % httpd.server_port,
|
'seed': make_url(httpd, '/infinite/'),
|
||||||
'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
|
'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@ -684,7 +711,7 @@ def test_warcprox_outage_resiliency(httpd):
|
|||||||
try:
|
try:
|
||||||
stop_service('warcprox')
|
stop_service('warcprox')
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.warn('problem stopping warcprox service: %s', e)
|
logging.warning('problem stopping warcprox service: %s', e)
|
||||||
|
|
||||||
# queue the site for brozzling
|
# queue the site for brozzling
|
||||||
brozzler.new_site(frontier, site)
|
brozzler.new_site(frontier, site)
|
||||||
@ -771,7 +798,7 @@ def test_time_limit(httpd):
|
|||||||
|
|
||||||
# create a new job with one seed that could be crawled forever
|
# create a new job with one seed that could be crawled forever
|
||||||
job_conf = {'seeds': [{
|
job_conf = {'seeds': [{
|
||||||
'url': 'http://localhost:%s/infinite/foo/' % httpd.server_port,
|
'url': make_url(httpd, '/infinite/foo/'),
|
||||||
'time_limit': 20}]}
|
'time_limit': 20}]}
|
||||||
job = brozzler.new_job(frontier, job_conf)
|
job = brozzler.new_job(frontier, job_conf)
|
||||||
assert job.id
|
assert job.id
|
||||||
@ -801,7 +828,7 @@ def test_ydl_stitching(httpd):
|
|||||||
rr = doublethink.Rethinker('localhost', db='brozzler')
|
rr = doublethink.Rethinker('localhost', db='brozzler')
|
||||||
frontier = brozzler.RethinkDbFrontier(rr)
|
frontier = brozzler.RethinkDbFrontier(rr)
|
||||||
site = brozzler.Site(rr, {
|
site = brozzler.Site(rr, {
|
||||||
'seed': 'http://localhost:%s/site10/' % httpd.server_port,
|
'seed': make_url(httpd, '/site10/'),
|
||||||
'warcprox_meta': {
|
'warcprox_meta': {
|
||||||
'warc-prefix': 'test_ydl_stitching',
|
'warc-prefix': 'test_ydl_stitching',
|
||||||
'captures-table-extra-fields': {'test_id':test_id}}})
|
'captures-table-extra-fields': {'test_id':test_id}}})
|
||||||
@ -819,7 +846,7 @@ def test_ydl_stitching(httpd):
|
|||||||
assert len(pages) == 1
|
assert len(pages) == 1
|
||||||
page = pages[0]
|
page = pages[0]
|
||||||
assert len(page.videos) == 6
|
assert len(page.videos) == 6
|
||||||
stitched_url = 'youtube-dl:00001:http://localhost:%s/site10/' % httpd.server_port
|
stitched_url = 'youtube-dl:00001:%s' % make_url(httpd, '/site10/')
|
||||||
assert {
|
assert {
|
||||||
'blame': 'youtube-dl',
|
'blame': 'youtube-dl',
|
||||||
'content-length': 267900,
|
'content-length': 267900,
|
||||||
|
@ -24,27 +24,27 @@ the brozzler virtualenv.
|
|||||||
::
|
::
|
||||||
|
|
||||||
my-laptop$ vagrant ssh
|
my-laptop$ vagrant ssh
|
||||||
vagrant@brzl:~$ source /opt/brozzler-ve34/bin/activate
|
vagrant@brzl:~$ source /opt/brozzler-ve3/bin/activate
|
||||||
(brozzler-ve34)vagrant@brzl:~$
|
(brozzler-ve3)vagrant@brzl:~$
|
||||||
|
|
||||||
Then you can run brozzler-new-site:
|
Then you can run brozzler-new-site:
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
(brozzler-ve34)vagrant@brzl:~$ brozzler-new-site --proxy=localhost:8000 http://example.com/
|
(brozzler-ve3)vagrant@brzl:~$ brozzler-new-site --proxy=localhost:8000 http://example.com/
|
||||||
|
|
||||||
|
|
||||||
Or brozzler-new-job (make sure to set the proxy to localhost:8000):
|
Or brozzler-new-job (make sure to set the proxy to localhost:8000):
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
(brozzler-ve34)vagrant@brzl:~$ cat >job1.yml <<EOF
|
(brozzler-ve3)vagrant@brzl:~$ cat >job1.yml <<EOF
|
||||||
id: job1
|
id: job1
|
||||||
proxy: localhost:8000 # point at warcprox for archiving
|
proxy: localhost:8000 # point at warcprox for archiving
|
||||||
seeds:
|
seeds:
|
||||||
- url: https://example.org/
|
- url: https://example.org/
|
||||||
EOF
|
EOF
|
||||||
(brozzler-ve34)vagrant@brzl:~$ brozzler-new-job job1.yml
|
(brozzler-ve3)vagrant@brzl:~$ brozzler-new-job job1.yml
|
||||||
|
|
||||||
WARC files will appear in ./warcs and brozzler, warcprox and rethinkdb logs in
|
WARC files will appear in ./warcs and brozzler, warcprox and rethinkdb logs in
|
||||||
./logs (via vagrant folders syncing).
|
./logs (via vagrant folders syncing).
|
||||||
|
4
vagrant/Vagrantfile
vendored
4
vagrant/Vagrantfile
vendored
@ -1,8 +1,9 @@
|
|||||||
Vagrant.configure(2) do |config|
|
Vagrant.configure(2) do |config|
|
||||||
config.vm.box = "ubuntu/trusty64"
|
config.vm.box = "ubuntu/xenial64"
|
||||||
config.vm.define "10.9.9.9"
|
config.vm.define "10.9.9.9"
|
||||||
config.vm.hostname = "brzl"
|
config.vm.hostname = "brzl"
|
||||||
config.vm.network :private_network, ip: "10.9.9.9"
|
config.vm.network :private_network, ip: "10.9.9.9"
|
||||||
|
config.disksize.size = '50GB'
|
||||||
|
|
||||||
config.vm.synced_folder "..", "/brozzler"
|
config.vm.synced_folder "..", "/brozzler"
|
||||||
|
|
||||||
@ -14,6 +15,7 @@ Vagrant.configure(2) do |config|
|
|||||||
config.vm.provision "ansible" do |ansible|
|
config.vm.provision "ansible" do |ansible|
|
||||||
ansible.inventory_path = "../ansible/hosts-vagrant"
|
ansible.inventory_path = "../ansible/hosts-vagrant"
|
||||||
ansible.playbook = "../ansible/playbook.yml"
|
ansible.playbook = "../ansible/playbook.yml"
|
||||||
|
# ansible.verbose = "-vvv"
|
||||||
end
|
end
|
||||||
|
|
||||||
config.vm.provider 'virtualbox' do |v|
|
config.vm.provider 'virtualbox' do |v|
|
||||||
|
@ -10,12 +10,12 @@ cd $(dirname "${BASH_SOURCE[0]}")
|
|||||||
vagrant up
|
vagrant up
|
||||||
|
|
||||||
echo service status:
|
echo service status:
|
||||||
vagrant ssh -- 'status warcprox ;
|
vagrant ssh -- 'sudo svstat /etc/service/warcprox ;
|
||||||
status Xvnc ;
|
sudo svstat /etc/service/Xvnc ;
|
||||||
status brozzler-worker ;
|
sudo svstat /etc/service/brozzler-worker ;
|
||||||
status brozzler-dashboard ;
|
sudo svstat /etc/service/brozzler-dashboard ;
|
||||||
status vnc-websock'
|
sudo svstat /etc/service/vnc-websock'
|
||||||
echo
|
echo
|
||||||
|
|
||||||
vagrant ssh -- 'set -x ; source /opt/brozzler-ve34/bin/activate && pip install pytest && pip install --upgrade --pre "warcprox>=2.1b1.dev86"'
|
vagrant ssh -- 'set -x ; source /opt/brozzler-ve3/bin/activate && pip install pytest==4.3.0 && pip install --upgrade --pre "warcprox>=2.1b1.dev86"'
|
||||||
vagrant ssh -- "source /opt/brozzler-ve34/bin/activate && DISPLAY=:1 py.test -v /brozzler/tests $@"
|
vagrant ssh -- "source /opt/brozzler-ve3/bin/activate && DISPLAY=:1 py.test --tb=native -v /brozzler/tests $@"
|
||||||
|
@ -7,7 +7,7 @@ This is a standalone script with no dependencies other than python, and should
|
|||||||
work with python 2.7 or python 3.2+. The only reason it's not a bash script is
|
work with python 2.7 or python 3.2+. The only reason it's not a bash script is
|
||||||
so we can use the argparse library.
|
so we can use the argparse library.
|
||||||
|
|
||||||
Copyright (C) 2016 Internet Archive
|
Copyright (C) 2016-2019 Internet Archive
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
you may not use this file except in compliance with the License.
|
you may not use this file except in compliance with the License.
|
||||||
@ -41,9 +41,8 @@ def main(argv=[]):
|
|||||||
subprocess.call([
|
subprocess.call([
|
||||||
'vagrant', 'ssh', '--',
|
'vagrant', 'ssh', '--',
|
||||||
'f=`mktemp` && cat > $f && '
|
'f=`mktemp` && cat > $f && '
|
||||||
'PYTHONPATH=/home/vagrant/brozzler-ve34/lib/python3.4/site-packages '
|
'/home/vagrant/brozzler-ve3/bin/python '
|
||||||
'/home/vagrant/brozzler-ve34/bin/python '
|
'/home/vagrant/brozzler-ve3/bin/brozzler-new-job $f'],
|
||||||
'/home/vagrant/brozzler-ve34/bin/brozzler-new-job $f'],
|
|
||||||
stdin=f)
|
stdin=f)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
@ -74,11 +74,8 @@ def main(argv=[]):
|
|||||||
os.chdir(os.path.dirname(__file__))
|
os.chdir(os.path.dirname(__file__))
|
||||||
|
|
||||||
cmd = (
|
cmd = (
|
||||||
'PYTHONPATH=/home/vagrant/brozzler-ve34/lib/python3.4/site-packages '
|
'/opt/brozzler-ve3/bin/python /opt/brozzler-ve3/bin/brozzler-new-site '
|
||||||
'/home/vagrant/brozzler-ve34/bin/python '
|
'%s %s') % (' '.join(options), args.seed)
|
||||||
'/home/vagrant/brozzler-ve34/bin/brozzler-new-site '
|
|
||||||
'--proxy=localhost:8000 %s %s') % (
|
|
||||||
' '.join(options), args.seed)
|
|
||||||
subprocess.call(['vagrant', 'ssh', '--', cmd])
|
subprocess.call(['vagrant', 'ssh', '--', cmd])
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
Loading…
x
Reference in New Issue
Block a user