mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-08-09 15:02:28 -04:00
Merge branch 'encodingFixIdentity' (and lots new from master) into qa
This commit is contained in:
commit
d30cd52c47
50 changed files with 529 additions and 197 deletions
66
README.rst
66
README.rst
|
@ -136,6 +136,72 @@ To start the app, run
|
||||||
|
|
||||||
See ``brozzler-webconsole --help`` for configuration options.
|
See ``brozzler-webconsole --help`` for configuration options.
|
||||||
|
|
||||||
|
Headless Chromium
|
||||||
|
-----------------
|
||||||
|
|
||||||
|
`Headless Chromium <https://chromium.googlesource.com/chromium/src/+/master/headless/README.md>`_
|
||||||
|
may optionally be used instead of Chromium or Chrome to run Brozzler without
|
||||||
|
a visible browser window or X11 server. At the time of writing
|
||||||
|
``headless_shell`` is a separate Linux-only executable and must be compiled
|
||||||
|
from source. Beware that compiling Chromium requires 10 GB of disk space,
|
||||||
|
several GB of RAM and patience.
|
||||||
|
|
||||||
|
Start by installing the dependencies listed in Chromium's `Linux-specific build
|
||||||
|
instructions <https://chromium.googlesource.com/chromium/src/+/master/docs/linux_build_instructions.md>`_.
|
||||||
|
|
||||||
|
Next install the build tools and fetch the source code:
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
mkdir -p ~/chromium
|
||||||
|
cd ~/chromium
|
||||||
|
git clone https://chromium.googlesource.com/chromium/tools/depot_tools.git
|
||||||
|
export PATH=$PWD/depot_tools:$PATH
|
||||||
|
fetch --no-history chromium --nosvn=True
|
||||||
|
|
||||||
|
Configure a headless release build (the debug builds are much larger):
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
cd src
|
||||||
|
mkdir -p out/release
|
||||||
|
echo 'import("//build/args/headless.gn")' > out/release/args.gn
|
||||||
|
echo 'is_debug = false' >> out/release/args.gn
|
||||||
|
gn gen out/release
|
||||||
|
|
||||||
|
Run the compile:
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
ninja -C out/release headless_shell
|
||||||
|
|
||||||
|
This will produce an ``out/release/headless_shell`` executable. Unfortunately
|
||||||
|
this cannot be used with Brozzler as-is because the ``--window-size`` command-line
|
||||||
|
option expects a different syntax in Headless Chromium. As a workaround create
|
||||||
|
a wrapper shell script ``headless_chromium.sh`` which replaces the misbehaving
|
||||||
|
option:
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
#!/bin/bash
|
||||||
|
exec ~/chromium/src/out/release/headless_shell "${@//--window-size=1100,900/--window-size=1100x900}"
|
||||||
|
|
||||||
|
Run brozzler passing the path to the wrapper script as the ``--chrome-exe``
|
||||||
|
option:
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
chmod +x ~/bin/headless_chromium.sh
|
||||||
|
brozzler-worker --chrome-exe ~/bin/headless_chromium.sh
|
||||||
|
|
||||||
|
The Pepper Flash plugin ``libpepflashplayer.so`` from an official Google Chrome
|
||||||
|
release may be used with Headless Chromium by adding this option to the wrapper
|
||||||
|
script:
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
--register-pepper-plugins=/opt/google/chrome/PepperFlash/libpepflashplayer.so;application/x-shockwave-flash
|
||||||
|
|
||||||
License
|
License
|
||||||
-------
|
-------
|
||||||
|
|
||||||
|
|
23
ansible/hosts-vagrant
Normal file
23
ansible/hosts-vagrant
Normal file
|
@ -0,0 +1,23 @@
|
||||||
|
[all:vars]
|
||||||
|
warcs_dir=/vagrant/warcs
|
||||||
|
brozzler_pip_name='-e /brozzler'
|
||||||
|
user=vagrant
|
||||||
|
### possible values for a prod deployment
|
||||||
|
# brozzler_pip_name=brozzler # get it from pypi
|
||||||
|
# brozzler_pip_name=git+https://github.com/internetarchive/brozzler.git#egg=brozzler
|
||||||
|
|
||||||
|
[rethinkdb]
|
||||||
|
10.9.9.9
|
||||||
|
|
||||||
|
[warcprox]
|
||||||
|
work_dir=/vagrant
|
||||||
|
10.9.9.9
|
||||||
|
|
||||||
|
[brozzler-worker]
|
||||||
|
10.9.9.9
|
||||||
|
|
||||||
|
[brozzler-webconsole]
|
||||||
|
10.9.9.9
|
||||||
|
|
||||||
|
[pywb]
|
||||||
|
10.9.9.9
|
|
@ -1,9 +1,4 @@
|
||||||
---
|
---
|
||||||
- name: apply common configuration to all nodes
|
|
||||||
hosts: all
|
|
||||||
roles:
|
|
||||||
- common
|
|
||||||
|
|
||||||
- name: deploy rethinkdb
|
- name: deploy rethinkdb
|
||||||
hosts: rethinkdb
|
hosts: rethinkdb
|
||||||
roles:
|
roles:
|
||||||
|
@ -27,4 +22,4 @@
|
||||||
- name: deploy pywb
|
- name: deploy pywb
|
||||||
hosts: pywb
|
hosts: pywb
|
||||||
roles:
|
roles:
|
||||||
- pywb
|
- pywb
|
3
ansible/roles/brozzler-webconsole/meta/main.yml
Normal file
3
ansible/roles/brozzler-webconsole/meta/main.yml
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
---
|
||||||
|
dependencies:
|
||||||
|
- role: common
|
|
@ -1,9 +1,15 @@
|
||||||
---
|
---
|
||||||
|
- name: mkdir {{venv_root}}/brozzler-webconsole-ve34
|
||||||
|
file: path={{venv_root}}/brozzler-webconsole-ve34 state=directory
|
||||||
|
owner={{user}}
|
||||||
|
become: true
|
||||||
- name: install brozzler[webconsole] in virtualenv
|
- name: install brozzler[webconsole] in virtualenv
|
||||||
pip: name='-e /brozzler[webconsole]'
|
pip: name='{{brozzler_pip_name}}[webconsole]'
|
||||||
virtualenv=/home/vagrant/brozzler-webconsole-ve34
|
virtualenv={{venv_root}}/brozzler-webconsole-ve34
|
||||||
virtualenv_python=python3.4
|
virtualenv_python=python3.4
|
||||||
extra_args='--no-input --upgrade --pre'
|
extra_args='--no-input --upgrade --pre'
|
||||||
|
become: true
|
||||||
|
become_user: '{{user}}'
|
||||||
notify:
|
notify:
|
||||||
- restart brozzler-webconsole
|
- restart brozzler-webconsole
|
||||||
- name: install upstart config /etc/init/brozzler-webconsole.conf
|
- name: install upstart config /etc/init/brozzler-webconsole.conf
|
|
@ -3,16 +3,16 @@ description "brozzler-webconsole"
|
||||||
start on runlevel [2345]
|
start on runlevel [2345]
|
||||||
stop on runlevel [!2345]
|
stop on runlevel [!2345]
|
||||||
|
|
||||||
env PYTHONPATH=/home/vagrant/brozzler-webconsole-ve34/lib/python3.4/site-packages
|
env PYTHONPATH={{venv_root}}/brozzler-webconsole-ve34/lib/python3.4/site-packages
|
||||||
env PATH=/home/vagrant/brozzler-webconsole-ve34/bin:/usr/bin:/bin
|
env PATH={{venv_root}}/brozzler-webconsole-ve34/bin:/usr/bin:/bin
|
||||||
env LC_ALL=C.UTF-8
|
env LC_ALL=C.UTF-8
|
||||||
|
|
||||||
env WAYBACK_BASEURL=http://{{groups['pywb'][0]}}:8880/brozzler
|
env WAYBACK_BASEURL=http://{{groups['pywb'][0]}}:8880/brozzler
|
||||||
env RETHINKDB_SERVERS={{groups['rethinkdb'] | join(',')}}
|
env RETHINKDB_SERVERS={{groups['rethinkdb'] | join(',')}}
|
||||||
env RETHINKDB_DB=brozzler
|
env RETHINKDB_DB=brozzler
|
||||||
|
|
||||||
setuid vagrant
|
setuid {{user}}
|
||||||
|
|
||||||
# console log
|
console log
|
||||||
|
|
||||||
exec gunicorn --bind=0.0.0.0:8881 brozzler.webconsole:app >>/vagrant/logs/brozzler-webconsole.log 2>&1
|
exec gunicorn --bind=0.0.0.0:8881 brozzler.webconsole:app
|
3
ansible/roles/brozzler-worker/meta/main.yml
Normal file
3
ansible/roles/brozzler-worker/meta/main.yml
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
---
|
||||||
|
dependencies:
|
||||||
|
- role: common
|
|
@ -3,60 +3,72 @@
|
||||||
apt_repository: repo='deb http://archive.canonical.com/ubuntu trusty partner'
|
apt_repository: repo='deb http://archive.canonical.com/ubuntu trusty partner'
|
||||||
state=present
|
state=present
|
||||||
become: true
|
become: true
|
||||||
|
- apt: update_cache=yes
|
||||||
|
become: true
|
||||||
- name: ensure required packages are installed
|
- name: ensure required packages are installed
|
||||||
become: true
|
become: true
|
||||||
apt: name={{item}} state=present
|
apt: name={{item}} state=present
|
||||||
with_items:
|
with_items:
|
||||||
- python-virtualenv
|
- python-virtualenv
|
||||||
- vnc4server
|
- vnc4server
|
||||||
- chromium-browser
|
- chromium-browser
|
||||||
- xfonts-base
|
- xfonts-base
|
||||||
- fonts-arphic-bkai00mp
|
- fonts-arphic-bkai00mp
|
||||||
- fonts-arphic-bsmi00lp
|
- fonts-arphic-bsmi00lp
|
||||||
- fonts-arphic-gbsn00lp
|
- fonts-arphic-gbsn00lp
|
||||||
- fonts-arphic-gkai00mp
|
- fonts-arphic-gkai00mp
|
||||||
- fonts-arphic-ukai
|
- fonts-arphic-ukai
|
||||||
- fonts-farsiweb
|
- fonts-farsiweb
|
||||||
- fonts-nafees
|
- fonts-nafees
|
||||||
- fonts-sil-abyssinica
|
- fonts-sil-abyssinica
|
||||||
- fonts-sil-ezra
|
- fonts-sil-ezra
|
||||||
- fonts-sil-padauk
|
- fonts-sil-padauk
|
||||||
- fonts-unfonts-extra
|
- fonts-unfonts-extra
|
||||||
- fonts-unfonts-core
|
- fonts-unfonts-core
|
||||||
- ttf-indic-fonts
|
- ttf-indic-fonts
|
||||||
- fonts-thai-tlwg
|
- fonts-thai-tlwg
|
||||||
- fonts-lklug-sinhala
|
- fonts-lklug-sinhala
|
||||||
- git
|
- git
|
||||||
- libjpeg-turbo8-dev
|
- libjpeg-turbo8-dev
|
||||||
- zlib1g-dev
|
- zlib1g-dev
|
||||||
- gcc
|
- gcc
|
||||||
- libpython3.4-dev
|
- libpython3.4-dev
|
||||||
- adobe-flashplugin
|
- adobe-flashplugin
|
||||||
- name: install Xvnc upstart config /etc/init/Xvnc.conf
|
- name: install Xvnc upstart config /etc/init/Xvnc.conf
|
||||||
template: src=templates/Xvnc.conf.j2 dest=/etc/init/Xvnc.conf
|
template: src=templates/Xvnc.conf.j2 dest=/etc/init/Xvnc.conf
|
||||||
become: true
|
become: true
|
||||||
notify:
|
notify:
|
||||||
- restart Xvnc
|
- restart Xvnc
|
||||||
|
- name: mkdir {{venv_root}}/websockify-ve34
|
||||||
|
become: true
|
||||||
|
file: path={{venv_root}}/websockify-ve34 state=directory owner={{user}}
|
||||||
- name: install websockify in virtualenv
|
- name: install websockify in virtualenv
|
||||||
pip: name=git+https://github.com/kanaka/websockify.git#egg=websockify
|
pip: name=git+https://github.com/kanaka/websockify.git#egg=websockify
|
||||||
virtualenv=/home/vagrant/websockify-ve34
|
virtualenv={{venv_root}}/websockify-ve34
|
||||||
virtualenv_python=python3.4
|
virtualenv_python=python3.4
|
||||||
extra_args='--no-input --upgrade --pre'
|
extra_args='--no-input --upgrade --pre'
|
||||||
|
become: true
|
||||||
|
become_user: '{{user}}'
|
||||||
- name: install vnc-websock upstart config /etc/init/vnc-websock.conf
|
- name: install vnc-websock upstart config /etc/init/vnc-websock.conf
|
||||||
template: src=templates/vnc-websock.conf.j2 dest=/etc/init/vnc-websock.conf
|
template: src=templates/vnc-websock.conf.j2 dest=/etc/init/vnc-websock.conf
|
||||||
become: true
|
become: true
|
||||||
notify:
|
notify:
|
||||||
- restart vnc-websock
|
- restart vnc-websock
|
||||||
|
- name: mkdir {{venv_root}}/brozzler-ve34
|
||||||
|
become: true
|
||||||
|
file: path={{venv_root}}/brozzler-ve34 state=directory owner={{user}}
|
||||||
- name: install brozzler in virtualenv
|
- name: install brozzler in virtualenv
|
||||||
pip: # name=git+https://github.com/internetarchive/brozzler.git#egg=brozzler
|
pip: # name=git+https://github.com/internetarchive/brozzler.git#egg=brozzler
|
||||||
name='-e /brozzler'
|
name='{{brozzler_pip_name}}'
|
||||||
virtualenv=/home/vagrant/brozzler-ve34
|
virtualenv={{venv_root}}/brozzler-ve34
|
||||||
virtualenv_python=python3.4
|
virtualenv_python=python3.4
|
||||||
extra_args='--no-input --upgrade --pre'
|
extra_args='--no-input --upgrade --pre'
|
||||||
|
become: true
|
||||||
|
become_user: '{{user}}'
|
||||||
notify:
|
notify:
|
||||||
- restart brozzler-worker
|
- restart brozzler-worker
|
||||||
- name: install brozzler-worker upstart config /etc/init/brozzler-worker.conf
|
- name: install brozzler-worker upstart config /etc/init/brozzler-worker.conf
|
||||||
template: src=templates/brozzler-worker.conf.j2 dest=/etc/init/brozzler-worker.conf
|
template: src=templates/brozzler-worker.conf.j2 dest=/etc/init/brozzler-worker.conf
|
||||||
become: true
|
become: true
|
||||||
notify:
|
notify:
|
||||||
- restart brozzler-worker
|
- restart brozzler-worker
|
|
@ -3,11 +3,11 @@ description "Xvnc"
|
||||||
start on runlevel [2345]
|
start on runlevel [2345]
|
||||||
stop on runlevel [!2345]
|
stop on runlevel [!2345]
|
||||||
|
|
||||||
setuid vagrant
|
setuid {{user}}
|
||||||
|
|
||||||
console log
|
console log
|
||||||
|
|
||||||
exec nice Xvnc4 :1 -auth /tmp/Xauthority.vagrant \
|
exec nice Xvnc4 :1 -auth /tmp/Xauthority.{{user}} \
|
||||||
-geometry 1600x1000 -depth 24 -rfbwait 0 -nolisten tcp -rfbport 5901 \
|
-geometry 1600x1000 -depth 24 -rfbwait 0 -nolisten tcp -rfbport 5901 \
|
||||||
-SecurityTypes None -pn -fp /usr/share/fonts/X11/misc/ -co /etc/X11/rgb \
|
-SecurityTypes None -pn -fp /usr/share/fonts/X11/misc/ -co /etc/X11/rgb \
|
||||||
AcceptCutText=0 AcceptPointerEvents=0 AcceptKeyEvents=0
|
AcceptCutText=0 AcceptPointerEvents=0 AcceptKeyEvents=0
|
|
@ -4,13 +4,13 @@ start on runlevel [2345]
|
||||||
stop on runlevel [!2345]
|
stop on runlevel [!2345]
|
||||||
|
|
||||||
env DISPLAY=:1
|
env DISPLAY=:1
|
||||||
env PATH=/home/vagrant/brozzler-ve34/bin:/usr/bin:/bin
|
env PATH={{venv_root}}/brozzler-ve34/bin:/usr/bin:/bin
|
||||||
env PYTHONPATH=/home/vagrant/brozzler-ve34/lib/python3.4/site-packages
|
env PYTHONPATH={{venv_root}}/brozzler-ve34/lib/python3.4/site-packages
|
||||||
env LANG=C.UTF-8
|
env LANG=C.UTF-8
|
||||||
|
|
||||||
setuid vagrant
|
setuid {{user}}
|
||||||
|
|
||||||
# console log
|
console log
|
||||||
|
|
||||||
# depends on vnc server
|
# depends on vnc server
|
||||||
start on started Xvnc
|
start on started Xvnc
|
||||||
|
@ -20,4 +20,4 @@ kill timeout 60
|
||||||
|
|
||||||
exec nice brozzler-worker \
|
exec nice brozzler-worker \
|
||||||
--rethinkdb-servers={{groups['rethinkdb'] | join(',')}} \
|
--rethinkdb-servers={{groups['rethinkdb'] | join(',')}} \
|
||||||
--max-browsers=4 >>/vagrant/logs/brozzler-worker.log 2>&1
|
--max-browsers=4
|
|
@ -3,12 +3,12 @@ description "vnc-websock"
|
||||||
start on runlevel [2345]
|
start on runlevel [2345]
|
||||||
stop on runlevel [!2345]
|
stop on runlevel [!2345]
|
||||||
|
|
||||||
setuid vagrant
|
setuid {{user}}
|
||||||
|
|
||||||
console log
|
console log
|
||||||
|
|
||||||
env PYTHONPATH=/home/vagrant/websockify-ve34/lib/python3.4/site-packages
|
env PYTHONPATH={{venv_root}}/websockify-ve34/lib/python3.4/site-packages
|
||||||
env PATH=/home/vagrant/websockify-ve34/bin:/usr/bin:/bin
|
env PATH={{venv_root}}/websockify-ve34/bin:/usr/bin:/bin
|
||||||
|
|
||||||
# port 8901 is hard-coded in brozzler/webconsole/static/partials/workers.html
|
# port 8901 is hard-coded in brozzler/webconsole/static/partials/workers.html
|
||||||
exec nice websockify 0.0.0.0:8901 localhost:5901
|
exec nice websockify 0.0.0.0:8901 localhost:5901
|
7
ansible/roles/common/defaults/main.yml
Normal file
7
ansible/roles/common/defaults/main.yml
Normal file
|
@ -0,0 +1,7 @@
|
||||||
|
# default values for variables; these can be overridden in the ansible inventory
|
||||||
|
# or various other places
|
||||||
|
---
|
||||||
|
user: brozzler
|
||||||
|
venv_root: /opt
|
||||||
|
warcs_dir: /var/tmp/warcs
|
||||||
|
brozzler_pip_name: brozzler # get it from pypi by default
|
|
@ -19,6 +19,12 @@
|
||||||
## command: python3 setup.py install chdir=/tmp/pip-8.1.2
|
## command: python3 setup.py install chdir=/tmp/pip-8.1.2
|
||||||
## creates=/usr/local/lib/python2.7/dist-packages/pip-8.1.2-py2.7.egg/pip/__init__.py
|
## creates=/usr/local/lib/python2.7/dist-packages/pip-8.1.2-py2.7.egg/pip/__init__.py
|
||||||
## become: true
|
## become: true
|
||||||
- name: mkdir /vagrant/logs
|
- command: id {{user}}
|
||||||
file: path=/vagrant/logs state=directory
|
register: id_user
|
||||||
|
ignore_errors: true
|
||||||
|
changed_when: false
|
||||||
|
- name: ensure service user {{user}} exists
|
||||||
|
user: name={{user}} system=yes createhome=no home=/nonexistent
|
||||||
|
shell=/usr/sbin/nologin
|
||||||
become: true
|
become: true
|
||||||
|
when: id_user|failed
|
3
ansible/roles/pywb/meta/main.yml
Normal file
3
ansible/roles/pywb/meta/main.yml
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
---
|
||||||
|
dependencies:
|
||||||
|
- role: common
|
|
@ -1,16 +1,24 @@
|
||||||
---
|
---
|
||||||
|
- name: mkdir {{venv_root}}/pywb-ve34
|
||||||
|
file: path={{venv_root}}/pywb-ve34 state=directory
|
||||||
|
owner={{user}}
|
||||||
|
become: true
|
||||||
- name: install pywb in virtualenv
|
- name: install pywb in virtualenv
|
||||||
pip: name=pywb
|
pip: name=pywb
|
||||||
virtualenv=/home/vagrant/pywb-ve34
|
virtualenv={{venv_root}}/pywb-ve34
|
||||||
virtualenv_python=python3.4
|
virtualenv_python=python3.4
|
||||||
extra_args='--no-input --upgrade --pre'
|
extra_args='--no-input --upgrade --pre'
|
||||||
|
become: true
|
||||||
|
become_user: '{{user}}'
|
||||||
notify:
|
notify:
|
||||||
- restart pywb
|
- restart pywb
|
||||||
- name: install brozzler in pywb virtualenv
|
- name: install brozzler in pywb virtualenv
|
||||||
pip: name='-e /brozzler'
|
pip: name='{{brozzler_pip_name}}'
|
||||||
virtualenv=/home/vagrant/pywb-ve34
|
virtualenv={{venv_root}}/pywb-ve34
|
||||||
virtualenv_python=python3.4
|
virtualenv_python=python3.4
|
||||||
extra_args='--no-input --upgrade --pre'
|
extra_args='--no-input --upgrade --pre'
|
||||||
|
become: true
|
||||||
|
become_user: '{{user}}'
|
||||||
notify:
|
notify:
|
||||||
- restart pywb
|
- restart pywb
|
||||||
- name: pywb config file /etc/pywb.yml
|
- name: pywb config file /etc/pywb.yml
|
14
ansible/roles/pywb/templates/pywb.conf.j2
Normal file
14
ansible/roles/pywb/templates/pywb.conf.j2
Normal file
|
@ -0,0 +1,14 @@
|
||||||
|
description "pywb"
|
||||||
|
|
||||||
|
start on runlevel [2345]
|
||||||
|
stop on runlevel [!2345]
|
||||||
|
|
||||||
|
env PYTHONPATH={{venv_root}}/pywb-ve34/lib/python3.4/site-packages
|
||||||
|
env PATH={{venv_root}}/pywb-ve34/bin:/usr/bin:/bin
|
||||||
|
env PYWB_CONFIG_FILE=/etc/pywb.yml
|
||||||
|
|
||||||
|
setuid {{user}}
|
||||||
|
|
||||||
|
console log
|
||||||
|
|
||||||
|
exec nice brozzler-wayback
|
|
@ -1,10 +1,13 @@
|
||||||
archive_paths: /vagrant/warcs/
|
archive_paths: {{warcs_dir}}/ # pywb will fail without a trailing slash
|
||||||
collections:
|
collections:
|
||||||
brozzler:
|
brozzler:
|
||||||
index_paths: !!python/object:brozzler.pywb.RethinkCDXSource
|
index_paths: !!python/object:brozzler.pywb.RethinkCDXSource
|
||||||
db: brozzler
|
db: brozzler
|
||||||
servers: [localhost]
|
|
||||||
table: captures
|
table: captures
|
||||||
|
servers:
|
||||||
|
{% for node in groups['rethinkdb'] %}
|
||||||
|
- {{node}}
|
||||||
|
{% endfor %}
|
||||||
enable_auto_colls: false
|
enable_auto_colls: false
|
||||||
enable_cdx_api: true
|
enable_cdx_api: true
|
||||||
framed_replay: true
|
framed_replay: true
|
|
@ -6,18 +6,24 @@
|
||||||
apt_repository: repo='deb http://download.rethinkdb.com/apt trusty main'
|
apt_repository: repo='deb http://download.rethinkdb.com/apt trusty main'
|
||||||
state=present
|
state=present
|
||||||
become: true
|
become: true
|
||||||
|
- apt: update_cache=yes
|
||||||
|
become: true
|
||||||
- name: ensure rethinkdb package is installed
|
- name: ensure rethinkdb package is installed
|
||||||
apt: name=rethinkdb state=present
|
apt: name=rethinkdb state=present
|
||||||
become: true
|
become: true
|
||||||
notify:
|
notify:
|
||||||
- restart rethinkdb
|
- restart rethinkdb
|
||||||
# XXX rethinkdb fails to start in spite of this, I think because /vagrant
|
|
||||||
# gets mounted too late, and it tries to log there
|
|
||||||
- name: ensure rethinkdb starts on reboot
|
- name: ensure rethinkdb starts on reboot
|
||||||
service: name=rethinkdb enabled=yes
|
service: name=rethinkdb enabled=yes
|
||||||
|
- stat: path=/var/log/rethinkdb.log
|
||||||
|
register: p
|
||||||
|
- name: ensure user rethinkdb owns /var/log/rethinkdb.log
|
||||||
|
file: path=/var/log/rethinkdb.log owner=rethinkdb state=touch mode=0644
|
||||||
|
when: not p.stat.exists
|
||||||
|
become: true
|
||||||
- name: ensure rethinkdb instance config file is installed
|
- name: ensure rethinkdb instance config file is installed
|
||||||
template: src=templates/rethinkdb-brozzler-vagrant-1.conf.j2
|
template: src=templates/rethinkdb-brozzler.conf.j2
|
||||||
dest=/etc/rethinkdb/instances.d/rethinkdb-brozzler-vagrant-1.conf
|
dest=/etc/rethinkdb/instances.d/rethinkdb-brozzler.conf
|
||||||
become: true
|
become: true
|
||||||
notify:
|
notify:
|
||||||
- restart rethinkdb
|
- restart rethinkdb
|
|
@ -0,0 +1,6 @@
|
||||||
|
bind=0.0.0.0
|
||||||
|
# directory=/var/lib/rethinkdb
|
||||||
|
log-file=/var/log/rethinkdb.log
|
||||||
|
{% for node in groups['rethinkdb'] %}
|
||||||
|
join={{node}}:29015
|
||||||
|
{% endfor %}
|
2
ansible/roles/warcprox/defaults/main.yml
Normal file
2
ansible/roles/warcprox/defaults/main.yml
Normal file
|
@ -0,0 +1,2 @@
|
||||||
|
---
|
||||||
|
work_dir: /var/tmp
|
4
ansible/roles/warcprox/handlers/main.yml
Normal file
4
ansible/roles/warcprox/handlers/main.yml
Normal file
|
@ -0,0 +1,4 @@
|
||||||
|
---
|
||||||
|
- name: restart warcprox
|
||||||
|
service: name=warcprox state=restarted
|
||||||
|
become: true
|
3
ansible/roles/warcprox/meta/main.yml
Normal file
3
ansible/roles/warcprox/meta/main.yml
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
---
|
||||||
|
dependencies:
|
||||||
|
- role: common
|
|
@ -11,11 +11,16 @@
|
||||||
- libssl-dev
|
- libssl-dev
|
||||||
- tor
|
- tor
|
||||||
- git
|
- git
|
||||||
|
- name: mkdir {{venv_root}}/warcprox-ve34
|
||||||
|
become: true
|
||||||
|
file: path={{venv_root}}/warcprox-ve34 state=directory owner={{user}}
|
||||||
- name: install warcprox in virtualenv
|
- name: install warcprox in virtualenv
|
||||||
pip: name=git+https://github.com/internetarchive/warcprox.git@2.x#egg=warcprox
|
pip: name=git+https://github.com/internetarchive/warcprox.git@2.x#egg=warcprox
|
||||||
virtualenv=/home/vagrant/warcprox-ve34
|
virtualenv={{venv_root}}/warcprox-ve34
|
||||||
virtualenv_python=python3.4
|
virtualenv_python=python3.4
|
||||||
extra_args='--no-input --upgrade --pre'
|
extra_args='--no-input --upgrade --pre'
|
||||||
|
become: true
|
||||||
|
become_user: '{{user}}'
|
||||||
notify:
|
notify:
|
||||||
- restart warcprox
|
- restart warcprox
|
||||||
- name: install upstart config /etc/init/warcprox.conf
|
- name: install upstart config /etc/init/warcprox.conf
|
24
ansible/roles/warcprox/templates/warcprox.conf.j2
Normal file
24
ansible/roles/warcprox/templates/warcprox.conf.j2
Normal file
|
@ -0,0 +1,24 @@
|
||||||
|
description "warcprox"
|
||||||
|
|
||||||
|
start on runlevel [2345]
|
||||||
|
stop on runlevel [!2345]
|
||||||
|
|
||||||
|
env PYTHONPATH={{venv_root}}/warcprox-ve34/lib/python3.4/site-packages
|
||||||
|
env PATH={{venv_root}}/warcprox-ve34/bin:/usr/bin:/bin
|
||||||
|
|
||||||
|
# by default warcprox creates some files/dirs relative to cwd
|
||||||
|
chdir {{work_dir}}
|
||||||
|
setuid {{user}}
|
||||||
|
|
||||||
|
console log
|
||||||
|
|
||||||
|
# --profile
|
||||||
|
exec nice warcprox \
|
||||||
|
--dir={{warcs_dir}} \
|
||||||
|
--base32 \
|
||||||
|
--gzip \
|
||||||
|
--rollover-idle-time=180 \
|
||||||
|
--onion-tor-socks-proxy=localhost:9050 \
|
||||||
|
--rethinkdb-servers={{groups['rethinkdb']|join(',')}} \
|
||||||
|
--rethinkdb-db=brozzler \
|
||||||
|
--rethinkdb-big-table
|
|
@ -186,7 +186,7 @@ class Browser:
|
||||||
cookie_location = os.path.join(
|
cookie_location = os.path.join(
|
||||||
self._work_dir.name, "chrome-user-data", "Default", "Cookies")
|
self._work_dir.name, "chrome-user-data", "Default", "Cookies")
|
||||||
self.logger.debug(
|
self.logger.debug(
|
||||||
"marking cookies persistent then reading file into memory: %s ",
|
"marking cookies persistent then reading file into memory: %s",
|
||||||
cookie_location)
|
cookie_location)
|
||||||
try:
|
try:
|
||||||
with sqlite3.connect(cookie_location) as conn:
|
with sqlite3.connect(cookie_location) as conn:
|
||||||
|
@ -230,6 +230,7 @@ class Browser:
|
||||||
|
|
||||||
def browse_page(
|
def browse_page(
|
||||||
self, url, extra_headers=None, behavior_parameters=None,
|
self, url, extra_headers=None, behavior_parameters=None,
|
||||||
|
user_agent=None,
|
||||||
on_request=None, on_response=None, on_screenshot=None,
|
on_request=None, on_response=None, on_screenshot=None,
|
||||||
on_url_change=None):
|
on_url_change=None):
|
||||||
"""
|
"""
|
||||||
|
@ -244,6 +245,7 @@ class Browser:
|
||||||
raise BrowsingException("browser has not been started")
|
raise BrowsingException("browser has not been started")
|
||||||
self.url = url
|
self.url = url
|
||||||
self.extra_headers = extra_headers
|
self.extra_headers = extra_headers
|
||||||
|
self.user_agent = user_agent
|
||||||
self.on_request = on_request
|
self.on_request = on_request
|
||||||
self.on_screenshot = on_screenshot
|
self.on_screenshot = on_screenshot
|
||||||
self.on_url_change = on_url_change
|
self.on_url_change = on_url_change
|
||||||
|
@ -459,11 +461,14 @@ __brzl_compileOutlinks(window).join(' ');
|
||||||
self.send_to_chrome(method="Runtime.enable")
|
self.send_to_chrome(method="Runtime.enable")
|
||||||
|
|
||||||
headers = self.extra_headers or {}
|
headers = self.extra_headers or {}
|
||||||
headers['Accept-Encoding'] = 'gzip, deflate'
|
headers['Accept-Encoding'] = 'identity'
|
||||||
self.send_to_chrome(
|
self.send_to_chrome(
|
||||||
method="Network.setExtraHTTPHeaders",
|
method="Network.setExtraHTTPHeaders",
|
||||||
params={"headers":headers})
|
params={"headers":headers})
|
||||||
|
|
||||||
|
if self.user_agent:
|
||||||
|
self.send_to_chrome(method="Network.setUserAgentOverride", params={"userAgent": self.user_agent})
|
||||||
|
|
||||||
# disable google analytics, see _handle_message() where breakpoint is caught "Debugger.paused"
|
# disable google analytics, see _handle_message() where breakpoint is caught "Debugger.paused"
|
||||||
self.send_to_chrome(method="Debugger.setBreakpointByUrl", params={"lineNumber": 1, "urlRegex":"https?://www.google-analytics.com/analytics.js"})
|
self.send_to_chrome(method="Debugger.setBreakpointByUrl", params={"lineNumber": 1, "urlRegex":"https?://www.google-analytics.com/analytics.js"})
|
||||||
|
|
||||||
|
|
|
@ -86,7 +86,7 @@ def _configure_logging(args):
|
||||||
warnings.simplefilter(
|
warnings.simplefilter(
|
||||||
'ignore', category=requests.packages.urllib3.exceptions.InsecurePlatformWarning)
|
'ignore', category=requests.packages.urllib3.exceptions.InsecurePlatformWarning)
|
||||||
|
|
||||||
def suggest_default_chome_exe():
|
def suggest_default_chrome_exe():
|
||||||
# mac os x application executable paths
|
# mac os x application executable paths
|
||||||
for path in [
|
for path in [
|
||||||
'/Applications/Chromium.app/Contents/MacOS/Chromium',
|
'/Applications/Chromium.app/Contents/MacOS/Chromium',
|
||||||
|
@ -118,7 +118,7 @@ def brozzle_page():
|
||||||
arg_parser.add_argument('url', metavar='URL', help='page url')
|
arg_parser.add_argument('url', metavar='URL', help='page url')
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'-e', '--chrome-exe', dest='chrome_exe',
|
'-e', '--chrome-exe', dest='chrome_exe',
|
||||||
default=suggest_default_chome_exe(),
|
default=suggest_default_chrome_exe(),
|
||||||
help='executable to use to invoke chrome')
|
help='executable to use to invoke chrome')
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'--proxy', dest='proxy', default=None,
|
'--proxy', dest='proxy', default=None,
|
||||||
|
@ -182,7 +182,12 @@ def brozzler_new_job():
|
||||||
r = rethinkstuff.Rethinker(
|
r = rethinkstuff.Rethinker(
|
||||||
args.rethinkdb_servers.split(','), args.rethinkdb_db)
|
args.rethinkdb_servers.split(','), args.rethinkdb_db)
|
||||||
frontier = brozzler.RethinkDbFrontier(r)
|
frontier = brozzler.RethinkDbFrontier(r)
|
||||||
brozzler.job.new_job_file(frontier, args.job_conf_file)
|
try:
|
||||||
|
brozzler.job.new_job_file(frontier, args.job_conf_file)
|
||||||
|
except brozzler.job.InvalidJobConf as e:
|
||||||
|
print('brozzler-new-job: invalid job file:', args.job_conf_file, file=sys.stderr)
|
||||||
|
print(' ' + yaml.dump(e.errors).rstrip().replace('\n', '\n '), file=sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
def brozzler_new_site():
|
def brozzler_new_site():
|
||||||
'''
|
'''
|
||||||
|
@ -238,7 +243,7 @@ def brozzler_worker():
|
||||||
_add_rethinkdb_options(arg_parser)
|
_add_rethinkdb_options(arg_parser)
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'-e', '--chrome-exe', dest='chrome_exe',
|
'-e', '--chrome-exe', dest='chrome_exe',
|
||||||
default=suggest_default_chome_exe(),
|
default=suggest_default_chrome_exe(),
|
||||||
help='executable to use to invoke chrome')
|
help='executable to use to invoke chrome')
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'-n', '--max-browsers', dest='max_browsers', default='1',
|
'-n', '--max-browsers', dest='max_browsers', default='1',
|
||||||
|
|
|
@ -87,7 +87,7 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
|
||||||
# brozzler-worker args
|
# brozzler-worker args
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'-e', '--chrome-exe', dest='chrome_exe',
|
'-e', '--chrome-exe', dest='chrome_exe',
|
||||||
default=brozzler.cli.suggest_default_chome_exe(),
|
default=brozzler.cli.suggest_default_chrome_exe(),
|
||||||
help='executable to use to invoke chrome')
|
help='executable to use to invoke chrome')
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'-n', '--max-browsers', dest='max_browsers',
|
'-n', '--max-browsers', dest='max_browsers',
|
||||||
|
|
|
@ -241,6 +241,15 @@ class RethinkDbFrontier:
|
||||||
else:
|
else:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def site(self, id):
|
||||||
|
if id is None:
|
||||||
|
return None
|
||||||
|
result = self.r.table("sites").get(id).run()
|
||||||
|
if result:
|
||||||
|
return brozzler.Site(**result)
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
|
||||||
def honor_stop_request(self, job_id):
|
def honor_stop_request(self, job_id):
|
||||||
"""Raises brozzler.CrawlJobStopped if stop has been requested."""
|
"""Raises brozzler.CrawlJobStopped if stop has been requested."""
|
||||||
job = self.job(job_id)
|
job = self.job(job_id)
|
||||||
|
|
|
@ -24,6 +24,28 @@ import json
|
||||||
import datetime
|
import datetime
|
||||||
import uuid
|
import uuid
|
||||||
import rethinkstuff
|
import rethinkstuff
|
||||||
|
import os
|
||||||
|
import cerberus
|
||||||
|
import urllib
|
||||||
|
|
||||||
|
def load_schema():
|
||||||
|
schema_file = os.path.join(os.path.dirname(__file__), 'job_schema.yaml')
|
||||||
|
with open(schema_file) as f:
|
||||||
|
return yaml.load(f)
|
||||||
|
|
||||||
|
class JobValidator(cerberus.Validator):
|
||||||
|
def _validate_type_url(self, value):
|
||||||
|
url = urllib.parse.urlparse(value)
|
||||||
|
return url.scheme in ('http', 'https', 'ftp')
|
||||||
|
|
||||||
|
class InvalidJobConf(Exception):
|
||||||
|
def __init__(self, errors):
|
||||||
|
self.errors = errors
|
||||||
|
|
||||||
|
def validate_conf(job_conf, schema=load_schema()):
|
||||||
|
v = JobValidator(schema)
|
||||||
|
if not v.validate(job_conf):
|
||||||
|
raise InvalidJobConf(v.errors)
|
||||||
|
|
||||||
def merge(a, b):
|
def merge(a, b):
|
||||||
if isinstance(a, dict) and isinstance(b, dict):
|
if isinstance(a, dict) and isinstance(b, dict):
|
||||||
|
@ -45,6 +67,7 @@ def new_job_file(frontier, job_conf_file):
|
||||||
new_job(frontier, job_conf)
|
new_job(frontier, job_conf)
|
||||||
|
|
||||||
def new_job(frontier, job_conf):
|
def new_job(frontier, job_conf):
|
||||||
|
validate_conf(job_conf)
|
||||||
job = Job(
|
job = Job(
|
||||||
id=job_conf.get("id"), conf=job_conf, status="ACTIVE",
|
id=job_conf.get("id"), conf=job_conf, status="ACTIVE",
|
||||||
started=rethinkstuff.utcnow())
|
started=rethinkstuff.utcnow())
|
||||||
|
@ -52,8 +75,6 @@ def new_job(frontier, job_conf):
|
||||||
sites = []
|
sites = []
|
||||||
for seed_conf in job_conf["seeds"]:
|
for seed_conf in job_conf["seeds"]:
|
||||||
merged_conf = merge(seed_conf, job_conf)
|
merged_conf = merge(seed_conf, job_conf)
|
||||||
# XXX check for unknown settings, invalid url, etc
|
|
||||||
|
|
||||||
site = brozzler.Site(
|
site = brozzler.Site(
|
||||||
job_id=job.id, seed=merged_conf["url"],
|
job_id=job.id, seed=merged_conf["url"],
|
||||||
scope=merged_conf.get("scope"),
|
scope=merged_conf.get("scope"),
|
||||||
|
@ -64,7 +85,8 @@ def new_job(frontier, job_conf):
|
||||||
"enable_warcprox_features"),
|
"enable_warcprox_features"),
|
||||||
warcprox_meta=merged_conf.get("warcprox_meta"),
|
warcprox_meta=merged_conf.get("warcprox_meta"),
|
||||||
metadata=merged_conf.get("metadata"),
|
metadata=merged_conf.get("metadata"),
|
||||||
remember_outlinks=merged_conf.get("remember_outlinks"))
|
remember_outlinks=merged_conf.get("remember_outlinks"),
|
||||||
|
user_agent=merged_conf.get("user_agent"))
|
||||||
sites.append(site)
|
sites.append(site)
|
||||||
|
|
||||||
# insert all the sites into database before the job
|
# insert all the sites into database before the job
|
||||||
|
|
82
brozzler/job_schema.yaml
Normal file
82
brozzler/job_schema.yaml
Normal file
|
@ -0,0 +1,82 @@
|
||||||
|
id:
|
||||||
|
type: string
|
||||||
|
required: true
|
||||||
|
|
||||||
|
<<: &multi_level_options
|
||||||
|
time_limit:
|
||||||
|
type: number
|
||||||
|
min: 0
|
||||||
|
|
||||||
|
enable_warcprox_features:
|
||||||
|
type: boolean
|
||||||
|
|
||||||
|
ignore_robots:
|
||||||
|
type: boolean
|
||||||
|
|
||||||
|
warcprox_meta:
|
||||||
|
type: dict
|
||||||
|
nullable: true
|
||||||
|
|
||||||
|
proxy:
|
||||||
|
type: string
|
||||||
|
nullable: true
|
||||||
|
|
||||||
|
scope:
|
||||||
|
type: dict
|
||||||
|
schema:
|
||||||
|
surt:
|
||||||
|
type: string
|
||||||
|
|
||||||
|
accepts:
|
||||||
|
type: list
|
||||||
|
schema: &scope_rule
|
||||||
|
type: dict
|
||||||
|
schema:
|
||||||
|
|
||||||
|
domain:
|
||||||
|
type: string
|
||||||
|
|
||||||
|
url_match:
|
||||||
|
type: string
|
||||||
|
allowed:
|
||||||
|
- STRING_MATCH
|
||||||
|
- SURT_MATCH
|
||||||
|
- REGEX_MATCH
|
||||||
|
|
||||||
|
value:
|
||||||
|
type: string
|
||||||
|
dependencies:
|
||||||
|
- url_match
|
||||||
|
|
||||||
|
blocks:
|
||||||
|
type: list
|
||||||
|
schema: *scope_rule
|
||||||
|
|
||||||
|
max_hops:
|
||||||
|
type: integer
|
||||||
|
|
||||||
|
max_hops_off_surt:
|
||||||
|
type: integer
|
||||||
|
|
||||||
|
remember_outlinks:
|
||||||
|
type: boolean
|
||||||
|
|
||||||
|
metadata:
|
||||||
|
type: dict
|
||||||
|
|
||||||
|
user_agent:
|
||||||
|
type: string
|
||||||
|
|
||||||
|
seeds:
|
||||||
|
type: list
|
||||||
|
required: true
|
||||||
|
minlength: 1
|
||||||
|
schema:
|
||||||
|
type: dict
|
||||||
|
schema:
|
||||||
|
|
||||||
|
url:
|
||||||
|
type: url
|
||||||
|
required: true
|
||||||
|
|
||||||
|
<<: *multi_level_options
|
|
@ -63,13 +63,16 @@ class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
|
||||||
# short-circuit this step and create the CDXObject directly
|
# short-circuit this step and create the CDXObject directly
|
||||||
blob = {
|
blob = {
|
||||||
'url': record['url'],
|
'url': record['url'],
|
||||||
'mime': record['content_type'],
|
|
||||||
'status': str(record['response_code']),
|
'status': str(record['response_code']),
|
||||||
'digest': record['sha1base32'],
|
'digest': record['sha1base32'],
|
||||||
'length': str(record['length']), # XXX is this the right length?
|
'length': str(record['length']), # XXX is this the right length?
|
||||||
'offset': str(record['offset']),
|
'offset': str(record['offset']),
|
||||||
'filename': record['filename'],
|
'filename': record['filename'],
|
||||||
}
|
}
|
||||||
|
if record['warc_type'] != 'revisit':
|
||||||
|
blob['mime'] = record['content_type']
|
||||||
|
else:
|
||||||
|
blob['mime'] = 'warc/revisit'
|
||||||
# b'org,archive)/ 20160427215530 {"url": "https://archive.org/", "mime": "text/html", "status": "200", "digest": "VILUFXZD232SLUA6XROZQIMEVUPW6EIE", "length": "16001", "offset": "90144", "filename": "ARCHIVEIT-261-ONE_TIME-JOB209607-20160427215508135-00000.warc.gz"}'
|
# b'org,archive)/ 20160427215530 {"url": "https://archive.org/", "mime": "text/html", "status": "200", "digest": "VILUFXZD232SLUA6XROZQIMEVUPW6EIE", "length": "16001", "offset": "90144", "filename": "ARCHIVEIT-261-ONE_TIME-JOB209607-20160427215508135-00000.warc.gz"}'
|
||||||
cdx_line = '{} {:%Y%m%d%H%M%S} {}'.format(
|
cdx_line = '{} {:%Y%m%d%H%M%S} {}'.format(
|
||||||
record['canon_surt'], record['timestamp'],
|
record['canon_surt'], record['timestamp'],
|
||||||
|
|
|
@ -42,6 +42,8 @@ def _robots_cache(site):
|
||||||
req_sesh.proxies = {"http":proxie,"https":proxie}
|
req_sesh.proxies = {"http":proxie,"https":proxie}
|
||||||
if site.extra_headers():
|
if site.extra_headers():
|
||||||
req_sesh.headers.update(site.extra_headers())
|
req_sesh.headers.update(site.extra_headers())
|
||||||
|
if site.user_agent:
|
||||||
|
req_sesh.headers['User-Agent'] = site.user_agent
|
||||||
_robots_caches[site.id] = reppy.cache.RobotsCache(session=req_sesh)
|
_robots_caches[site.id] = reppy.cache.RobotsCache(session=req_sesh)
|
||||||
|
|
||||||
return _robots_caches[site.id]
|
return _robots_caches[site.id]
|
||||||
|
|
|
@ -91,7 +91,8 @@ class Site(brozzler.BaseDictable):
|
||||||
enable_warcprox_features=False, reached_limit=None,
|
enable_warcprox_features=False, reached_limit=None,
|
||||||
status="ACTIVE", claimed=False, start_time=None,
|
status="ACTIVE", claimed=False, start_time=None,
|
||||||
last_disclaimed=_EPOCH_UTC, last_claimed_by=None,
|
last_disclaimed=_EPOCH_UTC, last_claimed_by=None,
|
||||||
last_claimed=_EPOCH_UTC, metadata={}, remember_outlinks=None, cookie_db=None):
|
last_claimed=_EPOCH_UTC, metadata={}, remember_outlinks=None,
|
||||||
|
cookie_db=None, user_agent=None):
|
||||||
|
|
||||||
self.seed = seed
|
self.seed = seed
|
||||||
self.id = id
|
self.id = id
|
||||||
|
@ -111,6 +112,7 @@ class Site(brozzler.BaseDictable):
|
||||||
self.metadata = metadata
|
self.metadata = metadata
|
||||||
self.remember_outlinks = remember_outlinks
|
self.remember_outlinks = remember_outlinks
|
||||||
self.cookie_db = cookie_db
|
self.cookie_db = cookie_db
|
||||||
|
self.user_agent = user_agent
|
||||||
|
|
||||||
self.scope = scope or {}
|
self.scope = scope or {}
|
||||||
if not "surt" in self.scope:
|
if not "surt" in self.scope:
|
||||||
|
|
|
@ -260,6 +260,7 @@ class BrozzlerWorker:
|
||||||
browser.start(proxy=self._proxy(site), cookie_db=site.cookie_db)
|
browser.start(proxy=self._proxy(site), cookie_db=site.cookie_db)
|
||||||
outlinks = browser.browse_page(
|
outlinks = browser.browse_page(
|
||||||
page.url, extra_headers=site.extra_headers(),
|
page.url, extra_headers=site.extra_headers(),
|
||||||
|
user_agent=site.user_agent,
|
||||||
on_screenshot=_on_screenshot,
|
on_screenshot=_on_screenshot,
|
||||||
on_url_change=page.note_redirect)
|
on_url_change=page.note_redirect)
|
||||||
return outlinks
|
return outlinks
|
||||||
|
@ -312,7 +313,8 @@ class BrozzlerWorker:
|
||||||
page = self._frontier.claim_page(site, "%s:%s" % (
|
page = self._frontier.claim_page(site, "%s:%s" % (
|
||||||
socket.gethostname(), browser.chrome_port))
|
socket.gethostname(), browser.chrome_port))
|
||||||
outlinks = self.brozzle_page(browser, site, page)
|
outlinks = self.brozzle_page(browser, site, page)
|
||||||
site.cookie_db=browser.persist_and_read_cookie_db()
|
if browser.is_running():
|
||||||
|
site.cookie_db = browser.persist_and_read_cookie_db()
|
||||||
self._frontier.completed_page(site, page)
|
self._frontier.completed_page(site, page)
|
||||||
self._frontier.scope_and_schedule_outlinks(
|
self._frontier.scope_and_schedule_outlinks(
|
||||||
site, page, outlinks)
|
site, page, outlinks)
|
||||||
|
|
12
job-conf.rst
12
job-conf.rst
|
@ -168,6 +168,18 @@ ignore_robots
|
||||||
If set to ``true``, brozzler will happily crawl pages that would otherwise be
|
If set to ``true``, brozzler will happily crawl pages that would otherwise be
|
||||||
blocked by robots.txt rules.
|
blocked by robots.txt rules.
|
||||||
|
|
||||||
|
user_agent
|
||||||
|
----------
|
||||||
|
+-----------------------+---------+----------+---------+
|
||||||
|
| scope | type | required | default |
|
||||||
|
+=======================+=========+==========+=========+
|
||||||
|
| seed-level, top-level | string | no | *none* |
|
||||||
|
+-----------------------+---------+----------+---------+
|
||||||
|
The ``User-Agent`` header brozzler will send to identify itself to web servers.
|
||||||
|
It's good etiquette to include a project URL with a notice to webmasters that
|
||||||
|
explains why you're crawling, how to block the crawler using robots.txt and how to
|
||||||
|
contact the operator if the crawl is causing problems.
|
||||||
|
|
||||||
warcprox_meta
|
warcprox_meta
|
||||||
-------------
|
-------------
|
||||||
+-----------------------+------------+----------+---------+
|
+-----------------------+------------+----------+---------+
|
||||||
|
|
11
setup.py
11
setup.py
|
@ -32,16 +32,16 @@ def find_package_data(package):
|
||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.1b6.dev87',
|
version='1.1b7.dev101',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
author_email='nlevitt@archive.org',
|
author_email='nlevitt@archive.org',
|
||||||
long_description=open('README.rst', encoding='UTF-8').read(),
|
long_description=open('README.rst', mode='rb').read().decode('UTF-8'),
|
||||||
license='Apache License 2.0',
|
license='Apache License 2.0',
|
||||||
packages=['brozzler', 'brozzler.webconsole'],
|
packages=['brozzler', 'brozzler.webconsole'],
|
||||||
package_data={
|
package_data={
|
||||||
'brozzler': ['behaviors.d/*.js*', 'behaviors.yaml'],
|
'brozzler': ['behaviors.d/*.js*', 'behaviors.yaml', 'job_schema.yaml'],
|
||||||
'brozzler.webconsole': find_package_data('brozzler.webconsole'),
|
'brozzler.webconsole': find_package_data('brozzler.webconsole'),
|
||||||
},
|
},
|
||||||
entry_points={
|
entry_points={
|
||||||
|
@ -62,11 +62,12 @@ setuptools.setup(
|
||||||
'reppy',
|
'reppy',
|
||||||
'requests',
|
'requests',
|
||||||
'websocket-client',
|
'websocket-client',
|
||||||
'pillow',
|
'pillow==3.3.0',
|
||||||
'surt>=0.3.0',
|
'surt>=0.3.0',
|
||||||
'rethinkstuff>=0.1.5',
|
'rethinkstuff>=0.1.5',
|
||||||
'rethinkdb>=2.3,<2.4',
|
'rethinkdb>=2.3,<2.4',
|
||||||
'psutil',
|
'psutil==4.3.0',
|
||||||
|
'cerberus==1.0.1',
|
||||||
],
|
],
|
||||||
extras_require={
|
extras_require={
|
||||||
'webconsole': ['flask>=0.11', 'gunicorn'],
|
'webconsole': ['flask>=0.11', 'gunicorn'],
|
||||||
|
|
|
@ -1,8 +1,7 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
'''
|
'''
|
||||||
cluster-integration-tests.py - integration tests for a brozzler cluster,
|
test_cluster.py - integration tests for a brozzler cluster, expects brozzler,
|
||||||
expects brozzler, warcprox, pywb, rethinkdb and other dependencies to be
|
warcprox, pywb, rethinkdb and other dependencies to be running already
|
||||||
running already
|
|
||||||
|
|
||||||
Copyright (C) 2016 Internet Archive
|
Copyright (C) 2016 Internet Archive
|
||||||
|
|
||||||
|
@ -26,6 +25,10 @@ import urllib.request
|
||||||
import os
|
import os
|
||||||
import socket
|
import socket
|
||||||
import rethinkstuff
|
import rethinkstuff
|
||||||
|
import time
|
||||||
|
import brozzler
|
||||||
|
import datetime
|
||||||
|
import requests
|
||||||
|
|
||||||
@pytest.fixture(scope='module')
|
@pytest.fixture(scope='module')
|
||||||
def httpd(request):
|
def httpd(request):
|
||||||
|
@ -53,13 +56,13 @@ def test_httpd(httpd):
|
||||||
'''
|
'''
|
||||||
payload1 = content2 = None
|
payload1 = content2 = None
|
||||||
with urllib.request.urlopen(
|
with urllib.request.urlopen(
|
||||||
'http://localhost:%s/' % httpd.server_port) as response:
|
'http://localhost:%s/file1.txt' % httpd.server_port) as response:
|
||||||
assert response.status == 200
|
assert response.status == 200
|
||||||
payload1 = response.read()
|
payload1 = response.read()
|
||||||
assert payload1
|
assert payload1
|
||||||
|
|
||||||
with urllib.request.urlopen(
|
with urllib.request.urlopen(
|
||||||
'http://localhost:%s/' % httpd.server_port) as response:
|
'http://localhost:%s/file1.txt' % httpd.server_port) as response:
|
||||||
assert response.status == 200
|
assert response.status == 200
|
||||||
payload2 = response.read()
|
payload2 = response.read()
|
||||||
assert payload2
|
assert payload2
|
||||||
|
@ -68,21 +71,71 @@ def test_httpd(httpd):
|
||||||
|
|
||||||
def test_services_up():
|
def test_services_up():
|
||||||
'''Check that the expected services are up and running.'''
|
'''Check that the expected services are up and running.'''
|
||||||
# check that warcprox is listening
|
|
||||||
with socket.socket() as s:
|
|
||||||
# if the connect fails an exception is raised and the test fails
|
|
||||||
s.connect(('localhost', 8000))
|
|
||||||
|
|
||||||
### # check that pywb is listening
|
|
||||||
### with socket.socket() as s:
|
|
||||||
### # if the connect fails an exception is raised and the test fails
|
|
||||||
### s.connect(('localhost', 8880))
|
|
||||||
|
|
||||||
# check that rethinkdb is listening and looks sane
|
# check that rethinkdb is listening and looks sane
|
||||||
r = rethinkstuff.Rethinker(db='rethinkdb') # built-in db
|
r = rethinkstuff.Rethinker(db='rethinkdb') # built-in db
|
||||||
tbls = r.table_list().run()
|
tbls = r.table_list().run()
|
||||||
assert len(tbls) > 10
|
assert len(tbls) > 10
|
||||||
|
|
||||||
def test_brozzle_site(httpd):
|
# check that warcprox is listening
|
||||||
pass
|
with socket.socket() as s:
|
||||||
|
# if the connect fails an exception is raised and the test fails
|
||||||
|
s.connect(('localhost', 8000))
|
||||||
|
|
||||||
|
# check that pywb is listening
|
||||||
|
with socket.socket() as s:
|
||||||
|
# if the connect fails an exception is raised and the test fails
|
||||||
|
s.connect(('localhost', 8880))
|
||||||
|
|
||||||
|
# check that brozzler webconsole is listening
|
||||||
|
with socket.socket() as s:
|
||||||
|
# if the connect fails an exception is raised and the test fails
|
||||||
|
s.connect(('localhost', 8881))
|
||||||
|
|
||||||
|
def test_brozzle_site(httpd):
|
||||||
|
test_id = 'test_brozzle_site-%s' % datetime.datetime.utcnow().isoformat()
|
||||||
|
site = brozzler.Site(
|
||||||
|
seed='http://localhost:%s/' % httpd.server_port,
|
||||||
|
proxy='localhost:8000', enable_warcprox_features=True,
|
||||||
|
warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})
|
||||||
|
|
||||||
|
# the two pages we expect to be crawled
|
||||||
|
page1 = 'http://localhost:%s/' % httpd.server_port
|
||||||
|
page2 = 'http://localhost:%s/file1.txt' % httpd.server_port
|
||||||
|
|
||||||
|
assert site.id is None
|
||||||
|
r = rethinkstuff.Rethinker('localhost', db='brozzler')
|
||||||
|
frontier = brozzler.RethinkDbFrontier(r)
|
||||||
|
brozzler.new_site(frontier, site)
|
||||||
|
assert site.id is not None
|
||||||
|
assert len(list(frontier.site_pages(site.id))) == 1
|
||||||
|
|
||||||
|
# the site should be brozzled fairly quickly
|
||||||
|
start = time.time()
|
||||||
|
while site.status != 'FINISHED' and time.time() - start < 300:
|
||||||
|
time.sleep(0.5)
|
||||||
|
site = frontier.site(site.id)
|
||||||
|
assert site.status == 'FINISHED'
|
||||||
|
|
||||||
|
# check that we got the two pages we expected
|
||||||
|
pages = list(frontier.site_pages(site.id))
|
||||||
|
assert len(pages) == 2
|
||||||
|
assert {page.url for page in pages} == {
|
||||||
|
'http://localhost:%s/' % httpd.server_port,
|
||||||
|
'http://localhost:%s/file1.txt' % httpd.server_port }
|
||||||
|
|
||||||
|
# take a look at the captures table
|
||||||
|
captures = r.table('captures').filter({'test_id':test_id}).run()
|
||||||
|
captures_by_url = {c['url']:c for c in captures if c['http_method'] != 'HEAD'}
|
||||||
|
assert page1 in captures_by_url
|
||||||
|
assert '%srobots.txt' % page1 in captures_by_url
|
||||||
|
assert page2 in captures_by_url
|
||||||
|
assert 'screenshot:%s' % page1 in captures_by_url
|
||||||
|
assert 'thumbnail:%s' % page1 in captures_by_url
|
||||||
|
# no screenshots of plaintext
|
||||||
|
|
||||||
|
# check pywb
|
||||||
|
t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
|
||||||
|
wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
|
||||||
|
expected_payload = open(os.path.join(
|
||||||
|
os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb').read()
|
||||||
|
assert requests.get(wb_url).content == expected_payload
|
||||||
|
|
4
vagrant/Vagrantfile
vendored
4
vagrant/Vagrantfile
vendored
|
@ -7,7 +7,7 @@ Vagrant.configure(2) do |config|
|
||||||
config.vm.synced_folder "..", "/brozzler"
|
config.vm.synced_folder "..", "/brozzler"
|
||||||
|
|
||||||
config.vm.provision "ansible" do |ansible|
|
config.vm.provision "ansible" do |ansible|
|
||||||
ansible.inventory_path = "ansible/hosts"
|
ansible.inventory_path = "../ansible/hosts-vagrant"
|
||||||
ansible.playbook = "ansible/playbook.yml"
|
ansible.playbook = "../ansible/playbook.yml"
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -1,16 +0,0 @@
|
||||||
ansible_ssh_private_key_file=.vagrant/machines/10.9.9.9/virtualbox/private_key
|
|
||||||
|
|
||||||
[rethinkdb]
|
|
||||||
10.9.9.9
|
|
||||||
|
|
||||||
[warcprox]
|
|
||||||
10.9.9.9
|
|
||||||
|
|
||||||
[brozzler-worker]
|
|
||||||
10.9.9.9
|
|
||||||
|
|
||||||
[brozzler-webconsole]
|
|
||||||
10.9.9.9
|
|
||||||
|
|
||||||
[pywb]
|
|
||||||
10.9.9.9
|
|
|
@ -1,14 +0,0 @@
|
||||||
description "pywb"
|
|
||||||
|
|
||||||
start on runlevel [2345]
|
|
||||||
stop on runlevel [!2345]
|
|
||||||
|
|
||||||
env PYTHONPATH=/home/vagrant/pywb-ve34/lib/python3.4/site-packages
|
|
||||||
env PATH=/home/vagrant/pywb-ve34/bin:/usr/bin:/bin
|
|
||||||
env PYWB_CONFIG_FILE=/etc/pywb.yml
|
|
||||||
|
|
||||||
setuid vagrant
|
|
||||||
|
|
||||||
# console log
|
|
||||||
|
|
||||||
exec nice brozzler-wayback >>/vagrant/logs/pywb.log 2>&1
|
|
|
@ -1,5 +0,0 @@
|
||||||
runuser=vagrant
|
|
||||||
bind=0.0.0.0
|
|
||||||
# directory=/var/lib/rethinkdb
|
|
||||||
# log-file=/var/log/rethinkdb.log
|
|
||||||
log-file=/vagrant/logs/rethinkdb.log # synced dir
|
|
|
@ -1,14 +0,0 @@
|
||||||
---
|
|
||||||
# - name: start warcprox
|
|
||||||
# environment:
|
|
||||||
# PYTHONPATH: /home/vagrant/warcprox-ve34/lib/python3.4/site-packages
|
|
||||||
# PATH: /home/vagrant/warcprox-ve34/bin:/usr/bin:/bin
|
|
||||||
# args:
|
|
||||||
# executable: /bin/bash
|
|
||||||
# shell: nice warcprox --dir=/vagrant/warcs --base32 --gzip
|
|
||||||
# --rollover-idle-time=180 --cacert=/vagrant/warcprox-ca.pem
|
|
||||||
# --onion-tor-socks-proxy=localhost:9050 --rethinkdb-servers=localhost
|
|
||||||
# --rethinkdb-big-table &> /vagrant/logs/warcprox.out &
|
|
||||||
- name: restart warcprox
|
|
||||||
service: name=warcprox state=restarted
|
|
||||||
become: true
|
|
|
@ -1,26 +0,0 @@
|
||||||
description "warcprox"
|
|
||||||
|
|
||||||
start on runlevel [2345]
|
|
||||||
stop on runlevel [!2345]
|
|
||||||
|
|
||||||
env PYTHONPATH=/home/vagrant/warcprox-ve34/lib/python3.4/site-packages
|
|
||||||
env PATH=/home/vagrant/warcprox-ve34/bin:/usr/bin:/bin
|
|
||||||
|
|
||||||
# by default warcprox creates some files/dirs relative to cwd
|
|
||||||
chdir /home/vagrant
|
|
||||||
setuid vagrant
|
|
||||||
|
|
||||||
# console log
|
|
||||||
|
|
||||||
# --profile
|
|
||||||
exec nice warcprox \
|
|
||||||
--dir=/vagrant/warcs \
|
|
||||||
--base32 \
|
|
||||||
--gzip \
|
|
||||||
--rollover-idle-time=180 \
|
|
||||||
--cacert=/vagrant/warcprox-ca.pem \
|
|
||||||
--onion-tor-socks-proxy=localhost:9050 \
|
|
||||||
--rethinkdb-servers=localhost \
|
|
||||||
--rethinkdb-db=brozzler \
|
|
||||||
--rethinkdb-big-table >>/vagrant/logs/warcprox.log 2>&1
|
|
||||||
# --rethinkdb-servers={{groups['rethinkdb'] | join(',')}} \
|
|
|
@ -1,5 +1,7 @@
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
|
cd $(dirname "${BASH_SOURCE[0]}")
|
||||||
|
|
||||||
echo service status:
|
echo service status:
|
||||||
vagrant ssh -- 'status warcprox ;
|
vagrant ssh -- 'status warcprox ;
|
||||||
status Xvnc ;
|
status Xvnc ;
|
||||||
|
@ -8,5 +10,5 @@ vagrant ssh -- 'status warcprox ;
|
||||||
status vnc-websock'
|
status vnc-websock'
|
||||||
echo
|
echo
|
||||||
|
|
||||||
vagrant ssh -- 'source brozzler-ve34/bin/activate && pip install pytest'
|
vagrant ssh -- 'source /opt/brozzler-ve34/bin/activate && pip install pytest'
|
||||||
vagrant ssh -- 'source brozzler-ve34/bin/activate && py.test -v -s /brozzler/tests'
|
vagrant ssh -- 'source /opt/brozzler-ve34/bin/activate && py.test -v -s /brozzler/tests'
|
||||||
|
|
|
@ -6,6 +6,20 @@ queue a job for your vagrant brozzler deployment.
|
||||||
This is a standalone script with no dependencies other than python, and should
|
This is a standalone script with no dependencies other than python, and should
|
||||||
work with python 2.7 or python 3.2+. The only reason it's not a bash script is
|
work with python 2.7 or python 3.2+. The only reason it's not a bash script is
|
||||||
so we can use the argparse library.
|
so we can use the argparse library.
|
||||||
|
|
||||||
|
Copyright (C) 2016 Internet Archive
|
||||||
|
|
||||||
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
you may not use this file except in compliance with the License.
|
||||||
|
You may obtain a copy of the License at
|
||||||
|
|
||||||
|
http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
|
||||||
|
Unless required by applicable law or agreed to in writing, software
|
||||||
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
See the License for the specific language governing permissions and
|
||||||
|
limitations under the License.
|
||||||
'''
|
'''
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
|
@ -20,23 +34,17 @@ def main(argv=[]):
|
||||||
help='brozzler job configuration file in yaml')
|
help='brozzler job configuration file in yaml')
|
||||||
args = arg_parser.parse_args(args=argv[1:])
|
args = arg_parser.parse_args(args=argv[1:])
|
||||||
|
|
||||||
with open(args.job_conf_file, 'rb') as f:
|
|
||||||
yaml_bytes = f.read()
|
|
||||||
subprocess.call(
|
|
||||||
['vagrant', 'ssh', '--', 'f=`mktemp` && cat > $f'],
|
|
||||||
stdin=yaml_bytes)
|
|
||||||
|
|
||||||
# cd to path with Vagrantfile so "vagrant ssh" knows what to do
|
# cd to path with Vagrantfile so "vagrant ssh" knows what to do
|
||||||
os.chdir(os.path.dirname(__file__))
|
os.chdir(os.path.dirname(__file__))
|
||||||
|
|
||||||
|
with open(args.job_conf_file, 'rb') as f:
|
||||||
|
subprocess.call([
|
||||||
|
'vagrant', 'ssh', '--',
|
||||||
|
'f=`mktemp` && cat > $f && '
|
||||||
|
'PYTHONPATH=/home/vagrant/brozzler-ve34/lib/python3.4/site-packages '
|
||||||
|
'/home/vagrant/brozzler-ve34/bin/python '
|
||||||
|
'/home/vagrant/brozzler-ve34/bin/brozzler-new-job $f'],
|
||||||
|
stdin=f)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
main(sys.argv)
|
main(sys.argv)
|
||||||
|
|
||||||
## # cd to path with Vagrantfile so "vagrant ssh" knows what to do
|
|
||||||
## script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
||||||
## cd $script_dir
|
|
||||||
##
|
|
||||||
## vagrant ssh -- \
|
|
||||||
## PYTHONPATH=/home/vagrant/brozzler-ve34/lib/python3.4/site-packages \
|
|
||||||
## /home/vagrant/brozzler-ve34/bin/python \
|
|
||||||
## /home/vagrant/brozzler-ve34/bin/brozzler-new-job "$@"
|
|
||||||
|
|
|
@ -10,7 +10,7 @@ This is a standalone script with no dependencies other than python, and should
|
||||||
work with python 2.7 or python 3.2+. The only reason it's not a bash script is
|
work with python 2.7 or python 3.2+. The only reason it's not a bash script is
|
||||||
so we can use the argparse library.
|
so we can use the argparse library.
|
||||||
|
|
||||||
Copyright (C) 2014-2016 Internet Archive
|
Copyright (C) 2016 Internet Archive
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
you may not use this file except in compliance with the License.
|
you may not use this file except in compliance with the License.
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue