mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-20 23:56:34 -04:00
Merge branch 'encodingFixIdentity' (and lots new from master) into qa
This commit is contained in:
commit
d30cd52c47
66
README.rst
66
README.rst
@ -136,6 +136,72 @@ To start the app, run
|
||||
|
||||
See ``brozzler-webconsole --help`` for configuration options.
|
||||
|
||||
Headless Chromium
|
||||
-----------------
|
||||
|
||||
`Headless Chromium <https://chromium.googlesource.com/chromium/src/+/master/headless/README.md>`_
|
||||
may optionally be used instead of Chromium or Chrome to run Brozzler without
|
||||
a visible browser window or X11 server. At the time of writing
|
||||
``headless_shell`` is a separate Linux-only executable and must be compiled
|
||||
from source. Beware that compiling Chromium requires 10 GB of disk space,
|
||||
several GB of RAM and patience.
|
||||
|
||||
Start by installing the dependencies listed in Chromium's `Linux-specific build
|
||||
instructions <https://chromium.googlesource.com/chromium/src/+/master/docs/linux_build_instructions.md>`_.
|
||||
|
||||
Next install the build tools and fetch the source code:
|
||||
|
||||
::
|
||||
|
||||
mkdir -p ~/chromium
|
||||
cd ~/chromium
|
||||
git clone https://chromium.googlesource.com/chromium/tools/depot_tools.git
|
||||
export PATH=$PWD/depot_tools:$PATH
|
||||
fetch --no-history chromium --nosvn=True
|
||||
|
||||
Configure a headless release build (the debug builds are much larger):
|
||||
|
||||
::
|
||||
|
||||
cd src
|
||||
mkdir -p out/release
|
||||
echo 'import("//build/args/headless.gn")' > out/release/args.gn
|
||||
echo 'is_debug = false' >> out/release/args.gn
|
||||
gn gen out/release
|
||||
|
||||
Run the compile:
|
||||
|
||||
::
|
||||
|
||||
ninja -C out/release headless_shell
|
||||
|
||||
This will produce an ``out/release/headless_shell`` executable. Unfortunately
|
||||
this cannot be used with Brozzler as-is as the ``--window-size`` command-line
|
||||
option expects a different syntax in Headless Chromium. As a workaround create
|
||||
a wrapper shell script ``headless_chromium.sh`` which replaces the misbehaving
|
||||
option:
|
||||
|
||||
::
|
||||
|
||||
#!/bin/bash
|
||||
exec ~/chromium/src/out/release/headless_shell "${@//--window-size=1100,900/--window-size=1100x900}"
|
||||
|
||||
Run brozzler passing the path to the wrapper script as the ``--chrome-exe``
|
||||
option:
|
||||
|
||||
::
|
||||
|
||||
chmod +x ~/bin/headless_chromium.sh
|
||||
brozzler-worker --chrome-exe ~/bin/headless_chromium.sh
|
||||
|
||||
The Pepper Flash plugin ``libpepflashplayer.so`` from an official Google Chrome
|
||||
release may be used with Headless Chromium by adding this option to the wrapper
|
||||
script:
|
||||
|
||||
::
|
||||
|
||||
--register-pepper-plugins=/opt/google/chrome/PepperFlash/libpepflashplayer.so;application/x-shockwave-flash
|
||||
|
||||
License
|
||||
-------
|
||||
|
||||
|
23
ansible/hosts-vagrant
Normal file
23
ansible/hosts-vagrant
Normal file
@ -0,0 +1,23 @@
|
||||
[all:vars]
|
||||
warcs_dir=/vagrant/warcs
|
||||
brozzler_pip_name='-e /brozzler'
|
||||
user=vagrant
|
||||
### possible values for a prod deployment
|
||||
# brozzler_pip_name=brozzler # get it from pypi
|
||||
# brozzler_pip_name=git+https://github.com/internetarchive/brozzler.git#egg=brozzler
|
||||
|
||||
[rethinkdb]
|
||||
10.9.9.9
|
||||
|
||||
[warcprox]
|
||||
work_dir=/vagrant
|
||||
10.9.9.9
|
||||
|
||||
[brozzler-worker]
|
||||
10.9.9.9
|
||||
|
||||
[brozzler-webconsole]
|
||||
10.9.9.9
|
||||
|
||||
[pywb]
|
||||
10.9.9.9
|
@ -1,9 +1,4 @@
|
||||
---
|
||||
- name: apply common configuration to all nodes
|
||||
hosts: all
|
||||
roles:
|
||||
- common
|
||||
|
||||
- name: deploy rethinkdb
|
||||
hosts: rethinkdb
|
||||
roles:
|
||||
@ -27,4 +22,4 @@
|
||||
- name: deploy pywb
|
||||
hosts: pywb
|
||||
roles:
|
||||
- pywb
|
||||
- pywb
|
3
ansible/roles/brozzler-webconsole/meta/main.yml
Normal file
3
ansible/roles/brozzler-webconsole/meta/main.yml
Normal file
@ -0,0 +1,3 @@
|
||||
---
|
||||
dependencies:
|
||||
- role: common
|
@ -1,9 +1,15 @@
|
||||
---
|
||||
- name: mkdir {{venv_root}}/brozzler-webconsole-ve34
|
||||
file: path={{venv_root}}/brozzler-webconsole-ve34 state=directory
|
||||
owner={{user}}
|
||||
become: true
|
||||
- name: install brozzler[webconsole] in virtualenv
|
||||
pip: name='-e /brozzler[webconsole]'
|
||||
virtualenv=/home/vagrant/brozzler-webconsole-ve34
|
||||
pip: name='{{brozzler_pip_name}}[webconsole]'
|
||||
virtualenv={{venv_root}}/brozzler-webconsole-ve34
|
||||
virtualenv_python=python3.4
|
||||
extra_args='--no-input --upgrade --pre'
|
||||
become: true
|
||||
become_user: '{{user}}'
|
||||
notify:
|
||||
- restart brozzler-webconsole
|
||||
- name: install upstart config /etc/init/brozzler-webconsole.conf
|
@ -3,16 +3,16 @@ description "brozzler-webconsole"
|
||||
start on runlevel [2345]
|
||||
stop on runlevel [!2345]
|
||||
|
||||
env PYTHONPATH=/home/vagrant/brozzler-webconsole-ve34/lib/python3.4/site-packages
|
||||
env PATH=/home/vagrant/brozzler-webconsole-ve34/bin:/usr/bin:/bin
|
||||
env PYTHONPATH={{venv_root}}/brozzler-webconsole-ve34/lib/python3.4/site-packages
|
||||
env PATH={{venv_root}}/brozzler-webconsole-ve34/bin:/usr/bin:/bin
|
||||
env LC_ALL=C.UTF-8
|
||||
|
||||
env WAYBACK_BASEURL=http://{{groups['pywb'][0]}}:8880/brozzler
|
||||
env RETHINKDB_SERVERS={{groups['rethinkdb'] | join(',')}}
|
||||
env RETHINKDB_DB=brozzler
|
||||
|
||||
setuid vagrant
|
||||
setuid {{user}}
|
||||
|
||||
# console log
|
||||
console log
|
||||
|
||||
exec gunicorn --bind=0.0.0.0:8881 brozzler.webconsole:app >>/vagrant/logs/brozzler-webconsole.log 2>&1
|
||||
exec gunicorn --bind=0.0.0.0:8881 brozzler.webconsole:app
|
3
ansible/roles/brozzler-worker/meta/main.yml
Normal file
3
ansible/roles/brozzler-worker/meta/main.yml
Normal file
@ -0,0 +1,3 @@
|
||||
---
|
||||
dependencies:
|
||||
- role: common
|
@ -3,60 +3,72 @@
|
||||
apt_repository: repo='deb http://archive.canonical.com/ubuntu trusty partner'
|
||||
state=present
|
||||
become: true
|
||||
- apt: update_cache=yes
|
||||
become: true
|
||||
- name: ensure required packages are installed
|
||||
become: true
|
||||
apt: name={{item}} state=present
|
||||
with_items:
|
||||
- python-virtualenv
|
||||
- vnc4server
|
||||
- chromium-browser
|
||||
- xfonts-base
|
||||
- fonts-arphic-bkai00mp
|
||||
- fonts-arphic-bsmi00lp
|
||||
- fonts-arphic-gbsn00lp
|
||||
- fonts-arphic-gkai00mp
|
||||
- fonts-arphic-ukai
|
||||
- fonts-farsiweb
|
||||
- fonts-nafees
|
||||
- fonts-sil-abyssinica
|
||||
- fonts-sil-ezra
|
||||
- fonts-sil-padauk
|
||||
- fonts-unfonts-extra
|
||||
- fonts-unfonts-core
|
||||
- ttf-indic-fonts
|
||||
- fonts-thai-tlwg
|
||||
- fonts-lklug-sinhala
|
||||
- git
|
||||
- libjpeg-turbo8-dev
|
||||
- zlib1g-dev
|
||||
- gcc
|
||||
- libpython3.4-dev
|
||||
- adobe-flashplugin
|
||||
- python-virtualenv
|
||||
- vnc4server
|
||||
- chromium-browser
|
||||
- xfonts-base
|
||||
- fonts-arphic-bkai00mp
|
||||
- fonts-arphic-bsmi00lp
|
||||
- fonts-arphic-gbsn00lp
|
||||
- fonts-arphic-gkai00mp
|
||||
- fonts-arphic-ukai
|
||||
- fonts-farsiweb
|
||||
- fonts-nafees
|
||||
- fonts-sil-abyssinica
|
||||
- fonts-sil-ezra
|
||||
- fonts-sil-padauk
|
||||
- fonts-unfonts-extra
|
||||
- fonts-unfonts-core
|
||||
- ttf-indic-fonts
|
||||
- fonts-thai-tlwg
|
||||
- fonts-lklug-sinhala
|
||||
- git
|
||||
- libjpeg-turbo8-dev
|
||||
- zlib1g-dev
|
||||
- gcc
|
||||
- libpython3.4-dev
|
||||
- adobe-flashplugin
|
||||
- name: install Xvnc upstart config /etc/init/Xvnc.conf
|
||||
template: src=templates/Xvnc.conf.j2 dest=/etc/init/Xvnc.conf
|
||||
become: true
|
||||
notify:
|
||||
- restart Xvnc
|
||||
- restart Xvnc
|
||||
- name: mkdir {{venv_root}}/websockify-ve34
|
||||
become: true
|
||||
file: path={{venv_root}}/websockify-ve34 state=directory owner={{user}}
|
||||
- name: install websockify in virtualenv
|
||||
pip: name=git+https://github.com/kanaka/websockify.git#egg=websockify
|
||||
virtualenv=/home/vagrant/websockify-ve34
|
||||
virtualenv={{venv_root}}/websockify-ve34
|
||||
virtualenv_python=python3.4
|
||||
extra_args='--no-input --upgrade --pre'
|
||||
become: true
|
||||
become_user: '{{user}}'
|
||||
- name: install vnc-websock upstart config /etc/init/vnc-websock.conf
|
||||
template: src=templates/vnc-websock.conf.j2 dest=/etc/init/vnc-websock.conf
|
||||
become: true
|
||||
notify:
|
||||
- restart vnc-websock
|
||||
- restart vnc-websock
|
||||
- name: mkdir {{venv_root}}/brozzler-ve34
|
||||
become: true
|
||||
file: path={{venv_root}}/brozzler-ve34 state=directory owner={{user}}
|
||||
- name: install brozzler in virtualenv
|
||||
pip: # name=git+https://github.com/internetarchive/brozzler.git#egg=brozzler
|
||||
name='-e /brozzler'
|
||||
virtualenv=/home/vagrant/brozzler-ve34
|
||||
name='{{brozzler_pip_name}}'
|
||||
virtualenv={{venv_root}}/brozzler-ve34
|
||||
virtualenv_python=python3.4
|
||||
extra_args='--no-input --upgrade --pre'
|
||||
become: true
|
||||
become_user: '{{user}}'
|
||||
notify:
|
||||
- restart brozzler-worker
|
||||
- restart brozzler-worker
|
||||
- name: install brozzler-worker upstart config /etc/init/brozzler-worker.conf
|
||||
template: src=templates/brozzler-worker.conf.j2 dest=/etc/init/brozzler-worker.conf
|
||||
become: true
|
||||
notify:
|
||||
- restart brozzler-worker
|
||||
- restart brozzler-worker
|
@ -3,11 +3,11 @@ description "Xvnc"
|
||||
start on runlevel [2345]
|
||||
stop on runlevel [!2345]
|
||||
|
||||
setuid vagrant
|
||||
setuid {{user}}
|
||||
|
||||
console log
|
||||
|
||||
exec nice Xvnc4 :1 -auth /tmp/Xauthority.vagrant \
|
||||
exec nice Xvnc4 :1 -auth /tmp/Xauthority.{{user}} \
|
||||
-geometry 1600x1000 -depth 24 -rfbwait 0 -nolisten tcp -rfbport 5901 \
|
||||
-SecurityTypes None -pn -fp /usr/share/fonts/X11/misc/ -co /etc/X11/rgb \
|
||||
AcceptCutText=0 AcceptPointerEvents=0 AcceptKeyEvents=0
|
@ -4,13 +4,13 @@ start on runlevel [2345]
|
||||
stop on runlevel [!2345]
|
||||
|
||||
env DISPLAY=:1
|
||||
env PATH=/home/vagrant/brozzler-ve34/bin:/usr/bin:/bin
|
||||
env PYTHONPATH=/home/vagrant/brozzler-ve34/lib/python3.4/site-packages
|
||||
env PATH={{venv_root}}/brozzler-ve34/bin:/usr/bin:/bin
|
||||
env PYTHONPATH={{venv_root}}/brozzler-ve34/lib/python3.4/site-packages
|
||||
env LANG=C.UTF-8
|
||||
|
||||
setuid vagrant
|
||||
setuid {{user}}
|
||||
|
||||
# console log
|
||||
console log
|
||||
|
||||
# depends on vnc server
|
||||
start on started Xvnc
|
||||
@ -20,4 +20,4 @@ kill timeout 60
|
||||
|
||||
exec nice brozzler-worker \
|
||||
--rethinkdb-servers={{groups['rethinkdb'] | join(',')}} \
|
||||
--max-browsers=4 >>/vagrant/logs/brozzler-worker.log 2>&1
|
||||
--max-browsers=4
|
@ -3,12 +3,12 @@ description "vnc-websock"
|
||||
start on runlevel [2345]
|
||||
stop on runlevel [!2345]
|
||||
|
||||
setuid vagrant
|
||||
setuid {{user}}
|
||||
|
||||
console log
|
||||
|
||||
env PYTHONPATH=/home/vagrant/websockify-ve34/lib/python3.4/site-packages
|
||||
env PATH=/home/vagrant/websockify-ve34/bin:/usr/bin:/bin
|
||||
env PYTHONPATH={{venv_root}}/websockify-ve34/lib/python3.4/site-packages
|
||||
env PATH={{venv_root}}/websockify-ve34/bin:/usr/bin:/bin
|
||||
|
||||
# port 8901 is hard-coded in brozzler/webconsole/static/partials/workers.html
|
||||
exec nice websockify 0.0.0.0:8901 localhost:5901
|
7
ansible/roles/common/defaults/main.yml
Normal file
7
ansible/roles/common/defaults/main.yml
Normal file
@ -0,0 +1,7 @@
|
||||
# variables default values, these can be overridden in the ansible inventory
|
||||
# or various other places
|
||||
---
|
||||
user: brozzler
|
||||
venv_root: /opt
|
||||
warcs_dir: /var/tmp/warcs
|
||||
brozzler_pip_name: brozzler # get it from pypi by default
|
@ -19,6 +19,12 @@
|
||||
## command: python3 setup.py install chdir=/tmp/pip-8.1.2
|
||||
## creates=/usr/local/lib/python2.7/dist-packages/pip-8.1.2-py2.7.egg/pip/__init__.py
|
||||
## become: true
|
||||
- name: mkdir /vagrant/logs
|
||||
file: path=/vagrant/logs state=directory
|
||||
- command: id {{user}}
|
||||
register: id_user
|
||||
ignore_errors: true
|
||||
changed_when: false
|
||||
- name: ensure service user {{user}} exists
|
||||
user: name={{user}} system=yes createhome=no home=/nonexistent
|
||||
shell=/usr/sbin/nologin
|
||||
become: true
|
||||
when: id_user|failed
|
3
ansible/roles/pywb/meta/main.yml
Normal file
3
ansible/roles/pywb/meta/main.yml
Normal file
@ -0,0 +1,3 @@
|
||||
---
|
||||
dependencies:
|
||||
- role: common
|
@ -1,16 +1,24 @@
|
||||
---
|
||||
- name: mkdir {{venv_root}}/pywb-ve34
|
||||
file: path={{venv_root}}/pywb-ve34 state=directory
|
||||
owner={{user}}
|
||||
become: true
|
||||
- name: install pywb in virtualenv
|
||||
pip: name=pywb
|
||||
virtualenv=/home/vagrant/pywb-ve34
|
||||
virtualenv={{venv_root}}/pywb-ve34
|
||||
virtualenv_python=python3.4
|
||||
extra_args='--no-input --upgrade --pre'
|
||||
become: true
|
||||
become_user: '{{user}}'
|
||||
notify:
|
||||
- restart pywb
|
||||
- name: install brozzler in pywb virtualenv
|
||||
pip: name='-e /brozzler'
|
||||
virtualenv=/home/vagrant/pywb-ve34
|
||||
pip: name='{{brozzler_pip_name}}'
|
||||
virtualenv={{venv_root}}/pywb-ve34
|
||||
virtualenv_python=python3.4
|
||||
extra_args='--no-input --upgrade --pre'
|
||||
become: true
|
||||
become_user: '{{user}}'
|
||||
notify:
|
||||
- restart pywb
|
||||
- name: pywb config file /etc/pywb.yml
|
14
ansible/roles/pywb/templates/pywb.conf.j2
Normal file
14
ansible/roles/pywb/templates/pywb.conf.j2
Normal file
@ -0,0 +1,14 @@
|
||||
description "pywb"
|
||||
|
||||
start on runlevel [2345]
|
||||
stop on runlevel [!2345]
|
||||
|
||||
env PYTHONPATH={{venv_root}}/pywb-ve34/lib/python3.4/site-packages
|
||||
env PATH={{venv_root}}/pywb-ve34/bin:/usr/bin:/bin
|
||||
env PYWB_CONFIG_FILE=/etc/pywb.yml
|
||||
|
||||
setuid {{user}}
|
||||
|
||||
console log
|
||||
|
||||
exec nice brozzler-wayback
|
@ -1,10 +1,13 @@
|
||||
archive_paths: /vagrant/warcs/
|
||||
archive_paths: {{warcs_dir}}/ # pywb will fail without a trailing slash
|
||||
collections:
|
||||
brozzler:
|
||||
index_paths: !!python/object:brozzler.pywb.RethinkCDXSource
|
||||
db: brozzler
|
||||
servers: [localhost]
|
||||
table: captures
|
||||
servers:
|
||||
{% for node in groups['rethinkdb'] %}
|
||||
- {{node}}
|
||||
{% endfor %}
|
||||
enable_auto_colls: false
|
||||
enable_cdx_api: true
|
||||
framed_replay: true
|
@ -6,18 +6,24 @@
|
||||
apt_repository: repo='deb http://download.rethinkdb.com/apt trusty main'
|
||||
state=present
|
||||
become: true
|
||||
- apt: update_cache=yes
|
||||
become: true
|
||||
- name: ensure rethinkdb package is installed
|
||||
apt: name=rethinkdb state=present
|
||||
become: true
|
||||
notify:
|
||||
- restart rethinkdb
|
||||
# XXX rethinkdb fails to start in spite of this, I think because /vagrant
|
||||
# gets mounted too late, and it tries to log there
|
||||
- name: ensure rethinkdb starts on reboot
|
||||
service: name=rethinkdb enabled=yes
|
||||
- stat: path=/var/log/rethinkdb.log
|
||||
register: p
|
||||
- name: ensure user rethinkdb owns /var/log/rethinkdb.log
|
||||
file: path=/var/log/rethinkdb.log owner=rethinkdb state=touch mode=0644
|
||||
when: not p.stat.exists
|
||||
become: true
|
||||
- name: ensure rethinkdb instance config file is installed
|
||||
template: src=templates/rethinkdb-brozzler-vagrant-1.conf.j2
|
||||
dest=/etc/rethinkdb/instances.d/rethinkdb-brozzler-vagrant-1.conf
|
||||
template: src=templates/rethinkdb-brozzler.conf.j2
|
||||
dest=/etc/rethinkdb/instances.d/rethinkdb-brozzler.conf
|
||||
become: true
|
||||
notify:
|
||||
- restart rethinkdb
|
@ -0,0 +1,6 @@
|
||||
bind=0.0.0.0
|
||||
# directory=/var/lib/rethinkdb
|
||||
log-file=/var/log/rethinkdb.log
|
||||
{% for node in groups['rethinkdb'] %}
|
||||
join={{node}}:29015
|
||||
{% endfor %}
|
2
ansible/roles/warcprox/defaults/main.yml
Normal file
2
ansible/roles/warcprox/defaults/main.yml
Normal file
@ -0,0 +1,2 @@
|
||||
---
|
||||
work_dir: /var/tmp
|
4
ansible/roles/warcprox/handlers/main.yml
Normal file
4
ansible/roles/warcprox/handlers/main.yml
Normal file
@ -0,0 +1,4 @@
|
||||
---
|
||||
- name: restart warcprox
|
||||
service: name=warcprox state=restarted
|
||||
become: true
|
3
ansible/roles/warcprox/meta/main.yml
Normal file
3
ansible/roles/warcprox/meta/main.yml
Normal file
@ -0,0 +1,3 @@
|
||||
---
|
||||
dependencies:
|
||||
- role: common
|
@ -11,11 +11,16 @@
|
||||
- libssl-dev
|
||||
- tor
|
||||
- git
|
||||
- name: mkdir {{venv_root}}/warcprox-ve34
|
||||
become: true
|
||||
file: path={{venv_root}}/warcprox-ve34 state=directory owner={{user}}
|
||||
- name: install warcprox in virtualenv
|
||||
pip: name=git+https://github.com/internetarchive/warcprox.git@2.x#egg=warcprox
|
||||
virtualenv=/home/vagrant/warcprox-ve34
|
||||
virtualenv={{venv_root}}/warcprox-ve34
|
||||
virtualenv_python=python3.4
|
||||
extra_args='--no-input --upgrade --pre'
|
||||
become: true
|
||||
become_user: '{{user}}'
|
||||
notify:
|
||||
- restart warcprox
|
||||
- name: install upstart config /etc/init/warcprox.conf
|
24
ansible/roles/warcprox/templates/warcprox.conf.j2
Normal file
24
ansible/roles/warcprox/templates/warcprox.conf.j2
Normal file
@ -0,0 +1,24 @@
|
||||
description "warcprox"
|
||||
|
||||
start on runlevel [2345]
|
||||
stop on runlevel [!2345]
|
||||
|
||||
env PYTHONPATH={{venv_root}}/warcprox-ve34/lib/python3.4/site-packages
|
||||
env PATH={{venv_root}}/warcprox-ve34/bin:/usr/bin:/bin
|
||||
|
||||
# by default warcprox creates some files/dirs relative to cwd
|
||||
chdir {{work_dir}}
|
||||
setuid {{user}}
|
||||
|
||||
console log
|
||||
|
||||
# --profile
|
||||
exec nice warcprox \
|
||||
--dir={{warcs_dir}} \
|
||||
--base32 \
|
||||
--gzip \
|
||||
--rollover-idle-time=180 \
|
||||
--onion-tor-socks-proxy=localhost:9050 \
|
||||
--rethinkdb-servers={{groups['rethinkdb']|join(',')}} \
|
||||
--rethinkdb-db=brozzler \
|
||||
--rethinkdb-big-table
|
@ -186,7 +186,7 @@ class Browser:
|
||||
cookie_location = os.path.join(
|
||||
self._work_dir.name, "chrome-user-data", "Default", "Cookies")
|
||||
self.logger.debug(
|
||||
"marking cookies persistent then reading file into memory: %s ",
|
||||
"marking cookies persistent then reading file into memory: %s",
|
||||
cookie_location)
|
||||
try:
|
||||
with sqlite3.connect(cookie_location) as conn:
|
||||
@ -230,6 +230,7 @@ class Browser:
|
||||
|
||||
def browse_page(
|
||||
self, url, extra_headers=None, behavior_parameters=None,
|
||||
user_agent=None,
|
||||
on_request=None, on_response=None, on_screenshot=None,
|
||||
on_url_change=None):
|
||||
"""
|
||||
@ -244,6 +245,7 @@ class Browser:
|
||||
raise BrowsingException("browser has not been started")
|
||||
self.url = url
|
||||
self.extra_headers = extra_headers
|
||||
self.user_agent = user_agent
|
||||
self.on_request = on_request
|
||||
self.on_screenshot = on_screenshot
|
||||
self.on_url_change = on_url_change
|
||||
@ -459,11 +461,14 @@ __brzl_compileOutlinks(window).join(' ');
|
||||
self.send_to_chrome(method="Runtime.enable")
|
||||
|
||||
headers = self.extra_headers or {}
|
||||
headers['Accept-Encoding'] = 'gzip, deflate'
|
||||
headers['Accept-Encoding'] = 'identity'
|
||||
self.send_to_chrome(
|
||||
method="Network.setExtraHTTPHeaders",
|
||||
params={"headers":headers})
|
||||
|
||||
if self.user_agent:
|
||||
self.send_to_chrome(method="Network.setUserAgentOverride", params={"userAgent": self.user_agent})
|
||||
|
||||
# disable google analytics, see _handle_message() where breakpoint is caught "Debugger.paused"
|
||||
self.send_to_chrome(method="Debugger.setBreakpointByUrl", params={"lineNumber": 1, "urlRegex":"https?://www.google-analytics.com/analytics.js"})
|
||||
|
||||
|
@ -86,7 +86,7 @@ def _configure_logging(args):
|
||||
warnings.simplefilter(
|
||||
'ignore', category=requests.packages.urllib3.exceptions.InsecurePlatformWarning)
|
||||
|
||||
def suggest_default_chome_exe():
|
||||
def suggest_default_chrome_exe():
|
||||
# mac os x application executable paths
|
||||
for path in [
|
||||
'/Applications/Chromium.app/Contents/MacOS/Chromium',
|
||||
@ -118,7 +118,7 @@ def brozzle_page():
|
||||
arg_parser.add_argument('url', metavar='URL', help='page url')
|
||||
arg_parser.add_argument(
|
||||
'-e', '--chrome-exe', dest='chrome_exe',
|
||||
default=suggest_default_chome_exe(),
|
||||
default=suggest_default_chrome_exe(),
|
||||
help='executable to use to invoke chrome')
|
||||
arg_parser.add_argument(
|
||||
'--proxy', dest='proxy', default=None,
|
||||
@ -182,7 +182,12 @@ def brozzler_new_job():
|
||||
r = rethinkstuff.Rethinker(
|
||||
args.rethinkdb_servers.split(','), args.rethinkdb_db)
|
||||
frontier = brozzler.RethinkDbFrontier(r)
|
||||
brozzler.job.new_job_file(frontier, args.job_conf_file)
|
||||
try:
|
||||
brozzler.job.new_job_file(frontier, args.job_conf_file)
|
||||
except brozzler.job.InvalidJobConf as e:
|
||||
print('brozzler-new-job: invalid job file:', args.job_conf_file, file=sys.stderr)
|
||||
print(' ' + yaml.dump(e.errors).rstrip().replace('\n', '\n '), file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
def brozzler_new_site():
|
||||
'''
|
||||
@ -238,7 +243,7 @@ def brozzler_worker():
|
||||
_add_rethinkdb_options(arg_parser)
|
||||
arg_parser.add_argument(
|
||||
'-e', '--chrome-exe', dest='chrome_exe',
|
||||
default=suggest_default_chome_exe(),
|
||||
default=suggest_default_chrome_exe(),
|
||||
help='executable to use to invoke chrome')
|
||||
arg_parser.add_argument(
|
||||
'-n', '--max-browsers', dest='max_browsers', default='1',
|
||||
|
@ -87,7 +87,7 @@ def _build_arg_parser(prog=os.path.basename(sys.argv[0])):
|
||||
# brozzler-worker args
|
||||
arg_parser.add_argument(
|
||||
'-e', '--chrome-exe', dest='chrome_exe',
|
||||
default=brozzler.cli.suggest_default_chome_exe(),
|
||||
default=brozzler.cli.suggest_default_chrome_exe(),
|
||||
help='executable to use to invoke chrome')
|
||||
arg_parser.add_argument(
|
||||
'-n', '--max-browsers', dest='max_browsers',
|
||||
|
@ -241,6 +241,15 @@ class RethinkDbFrontier:
|
||||
else:
|
||||
return None
|
||||
|
||||
def site(self, id):
    """
    Looks up a single site by id in the "sites" table.

    Returns a brozzler.Site built from the stored document, or None if
    `id` is None or the lookup comes back empty.
    """
    if id is None:
        return None
    row = self.r.table("sites").get(id).run()
    return brozzler.Site(**row) if row else None
|
||||
|
||||
def honor_stop_request(self, job_id):
|
||||
"""Raises brozzler.CrawlJobStopped if stop has been requested."""
|
||||
job = self.job(job_id)
|
||||
|
@ -24,6 +24,28 @@ import json
|
||||
import datetime
|
||||
import uuid
|
||||
import rethinkstuff
|
||||
import os
|
||||
import cerberus
|
||||
import urllib
|
||||
|
||||
def load_schema():
    '''
    Loads the job configuration schema from job_schema.yaml, which is
    expected to sit in the same directory as this module.

    Returns the parsed yaml document.
    '''
    schema_file = os.path.join(os.path.dirname(__file__), 'job_schema.yaml')
    # The schema is plain data, so safe_load is all that is needed and it
    # avoids the arbitrary object construction that yaml.load permits. The
    # encoding is given explicitly so that reading the file does not depend
    # on the locale of the process.
    with open(schema_file, encoding='utf-8') as f:
        return yaml.safe_load(f)
|
||||
|
||||
class JobValidator(cerberus.Validator):
    '''
    cerberus validator with an additional "url" type, which the job schema
    uses for seed urls.
    '''
    def _validate_type_url(self, value):
        '''
        Returns True if `value` is a string that parses as a url with
        scheme http, https or ftp, and False otherwise.
        '''
        # A bare `import urllib` does not guarantee that the parse submodule
        # has been loaded, so import it explicitly here.
        import urllib.parse

        # Truthy non-string values (a number, a list, ...) make urlparse
        # raise rather than return, which would surface as a traceback
        # instead of a validation error.
        if not isinstance(value, str):
            return False
        try:
            url = urllib.parse.urlparse(value)
        except ValueError:
            # urlparse rejects some malformed strings outright
            return False
        return url.scheme in ('http', 'https', 'ftp')
|
||||
|
||||
class InvalidJobConf(Exception):
    '''
    Raised when a job configuration does not pass validation.

    The errors reported by the validator are kept in the `errors`
    attribute.
    '''
    def __init__(self, errors):
        # initialize the base class explicitly so the errors are also
        # carried in the exception's args
        super().__init__(errors)
        self.errors = errors
|
||||
|
||||
def validate_conf(job_conf, schema=None):
    '''
    Validates `job_conf` against a job schema.

    `schema` is the schema to validate against; when it is None the schema
    returned by load_schema() is used.

    Raises InvalidJobConf, carrying the validator's errors, if `job_conf`
    does not validate.
    '''
    # Load the default lazily. A default of `load_schema()` in the signature
    # would be evaluated once at import time, reading the schema file as a
    # side effect of importing this module and sharing a single mutable
    # schema object between all calls.
    if schema is None:
        schema = load_schema()
    v = JobValidator(schema)
    if not v.validate(job_conf):
        raise InvalidJobConf(v.errors)
|
||||
|
||||
def merge(a, b):
|
||||
if isinstance(a, dict) and isinstance(b, dict):
|
||||
@ -45,6 +67,7 @@ def new_job_file(frontier, job_conf_file):
|
||||
new_job(frontier, job_conf)
|
||||
|
||||
def new_job(frontier, job_conf):
|
||||
validate_conf(job_conf)
|
||||
job = Job(
|
||||
id=job_conf.get("id"), conf=job_conf, status="ACTIVE",
|
||||
started=rethinkstuff.utcnow())
|
||||
@ -52,8 +75,6 @@ def new_job(frontier, job_conf):
|
||||
sites = []
|
||||
for seed_conf in job_conf["seeds"]:
|
||||
merged_conf = merge(seed_conf, job_conf)
|
||||
# XXX check for unknown settings, invalid url, etc
|
||||
|
||||
site = brozzler.Site(
|
||||
job_id=job.id, seed=merged_conf["url"],
|
||||
scope=merged_conf.get("scope"),
|
||||
@ -64,7 +85,8 @@ def new_job(frontier, job_conf):
|
||||
"enable_warcprox_features"),
|
||||
warcprox_meta=merged_conf.get("warcprox_meta"),
|
||||
metadata=merged_conf.get("metadata"),
|
||||
remember_outlinks=merged_conf.get("remember_outlinks"))
|
||||
remember_outlinks=merged_conf.get("remember_outlinks"),
|
||||
user_agent=merged_conf.get("user_agent"))
|
||||
sites.append(site)
|
||||
|
||||
# insert all the sites into database before the job
|
||||
|
82
brozzler/job_schema.yaml
Normal file
82
brozzler/job_schema.yaml
Normal file
@ -0,0 +1,82 @@
|
||||
id:
|
||||
type: string
|
||||
required: true
|
||||
|
||||
<<: &multi_level_options
|
||||
time_limit:
|
||||
type: number
|
||||
min: 0
|
||||
|
||||
enable_warcprox_features:
|
||||
type: boolean
|
||||
|
||||
ignore_robots:
|
||||
type: boolean
|
||||
|
||||
warcprox_meta:
|
||||
type: dict
|
||||
nullable: true
|
||||
|
||||
proxy:
|
||||
type: string
|
||||
nullable: true
|
||||
|
||||
scope:
|
||||
type: dict
|
||||
schema:
|
||||
surt:
|
||||
type: string
|
||||
|
||||
accepts:
|
||||
type: list
|
||||
schema: &scope_rule
|
||||
type: dict
|
||||
schema:
|
||||
|
||||
domain:
|
||||
type: string
|
||||
|
||||
url_match:
|
||||
type: string
|
||||
allowed:
|
||||
- STRING_MATCH
|
||||
- SURT_MATCH
|
||||
- REGEX_MATCH
|
||||
|
||||
value:
|
||||
type: string
|
||||
dependencies:
|
||||
- url_match
|
||||
|
||||
blocks:
|
||||
type: list
|
||||
schema: *scope_rule
|
||||
|
||||
max_hops:
|
||||
type: integer
|
||||
|
||||
max_hops_off_surt:
|
||||
type: integer
|
||||
|
||||
remember_outlinks:
|
||||
type: boolean
|
||||
|
||||
metadata:
|
||||
type: dict
|
||||
|
||||
user_agent:
|
||||
type: string
|
||||
|
||||
seeds:
|
||||
type: list
|
||||
required: true
|
||||
minlength: 1
|
||||
schema:
|
||||
type: dict
|
||||
schema:
|
||||
|
||||
url:
|
||||
type: url
|
||||
required: true
|
||||
|
||||
<<: *multi_level_options
|
@ -63,13 +63,16 @@ class RethinkCDXSource(pywb.cdx.cdxsource.CDXSource):
|
||||
# short-circuit this step and create the CDXObject directly
|
||||
blob = {
|
||||
'url': record['url'],
|
||||
'mime': record['content_type'],
|
||||
'status': str(record['response_code']),
|
||||
'digest': record['sha1base32'],
|
||||
'length': str(record['length']), # XXX is this the right length?
|
||||
'offset': str(record['offset']),
|
||||
'filename': record['filename'],
|
||||
}
|
||||
if record['warc_type'] != 'revisit':
|
||||
blob['mime'] = record['content_type']
|
||||
else:
|
||||
blob['mime'] = 'warc/revisit'
|
||||
# b'org,archive)/ 20160427215530 {"url": "https://archive.org/", "mime": "text/html", "status": "200", "digest": "VILUFXZD232SLUA6XROZQIMEVUPW6EIE", "length": "16001", "offset": "90144", "filename": "ARCHIVEIT-261-ONE_TIME-JOB209607-20160427215508135-00000.warc.gz"}'
|
||||
cdx_line = '{} {:%Y%m%d%H%M%S} {}'.format(
|
||||
record['canon_surt'], record['timestamp'],
|
||||
|
@ -42,6 +42,8 @@ def _robots_cache(site):
|
||||
req_sesh.proxies = {"http":proxie,"https":proxie}
|
||||
if site.extra_headers():
|
||||
req_sesh.headers.update(site.extra_headers())
|
||||
if site.user_agent:
|
||||
req_sesh.headers['User-Agent'] = site.user_agent
|
||||
_robots_caches[site.id] = reppy.cache.RobotsCache(session=req_sesh)
|
||||
|
||||
return _robots_caches[site.id]
|
||||
|
@ -91,7 +91,8 @@ class Site(brozzler.BaseDictable):
|
||||
enable_warcprox_features=False, reached_limit=None,
|
||||
status="ACTIVE", claimed=False, start_time=None,
|
||||
last_disclaimed=_EPOCH_UTC, last_claimed_by=None,
|
||||
last_claimed=_EPOCH_UTC, metadata={}, remember_outlinks=None, cookie_db=None):
|
||||
last_claimed=_EPOCH_UTC, metadata={}, remember_outlinks=None,
|
||||
cookie_db=None, user_agent=None):
|
||||
|
||||
self.seed = seed
|
||||
self.id = id
|
||||
@ -111,6 +112,7 @@ class Site(brozzler.BaseDictable):
|
||||
self.metadata = metadata
|
||||
self.remember_outlinks = remember_outlinks
|
||||
self.cookie_db = cookie_db
|
||||
self.user_agent = user_agent
|
||||
|
||||
self.scope = scope or {}
|
||||
if not "surt" in self.scope:
|
||||
|
@ -260,6 +260,7 @@ class BrozzlerWorker:
|
||||
browser.start(proxy=self._proxy(site), cookie_db=site.cookie_db)
|
||||
outlinks = browser.browse_page(
|
||||
page.url, extra_headers=site.extra_headers(),
|
||||
user_agent=site.user_agent,
|
||||
on_screenshot=_on_screenshot,
|
||||
on_url_change=page.note_redirect)
|
||||
return outlinks
|
||||
@ -312,7 +313,8 @@ class BrozzlerWorker:
|
||||
page = self._frontier.claim_page(site, "%s:%s" % (
|
||||
socket.gethostname(), browser.chrome_port))
|
||||
outlinks = self.brozzle_page(browser, site, page)
|
||||
site.cookie_db=browser.persist_and_read_cookie_db()
|
||||
if browser.is_running():
|
||||
site.cookie_db = browser.persist_and_read_cookie_db()
|
||||
self._frontier.completed_page(site, page)
|
||||
self._frontier.scope_and_schedule_outlinks(
|
||||
site, page, outlinks)
|
||||
|
12
job-conf.rst
12
job-conf.rst
@ -168,6 +168,18 @@ ignore_robots
|
||||
If set to ``true``, brozzler will happily crawl pages that would otherwise be
|
||||
blocked by robots.txt rules.
|
||||
|
||||
user_agent
|
||||
----------
|
||||
+-----------------------+---------+----------+---------+
|
||||
| scope | type | required | default |
|
||||
+=======================+=========+==========+=========+
|
||||
| seed-level, top-level | string | no | *none* |
|
||||
+-----------------------+---------+----------+---------+
|
||||
The ``User-Agent`` header brozzler will send to identify itself to web servers.
|
||||
It's good etiquette to include a project URL with a notice to webmasters that
|
||||
explains why you're crawling, how to block the crawler using robots.txt and how to
|
||||
contact the operator if the crawl is causing problems.
|
||||
|
||||
warcprox_meta
|
||||
-------------
|
||||
+-----------------------+------------+----------+---------+
|
||||
|
11
setup.py
11
setup.py
@ -32,16 +32,16 @@ def find_package_data(package):
|
||||
|
||||
setuptools.setup(
|
||||
name='brozzler',
|
||||
version='1.1b6.dev87',
|
||||
version='1.1b7.dev101',
|
||||
description='Distributed web crawling with browsers',
|
||||
url='https://github.com/internetarchive/brozzler',
|
||||
author='Noah Levitt',
|
||||
author_email='nlevitt@archive.org',
|
||||
long_description=open('README.rst', encoding='UTF-8').read(),
|
||||
long_description=open('README.rst', mode='rb').read().decode('UTF-8'),
|
||||
license='Apache License 2.0',
|
||||
packages=['brozzler', 'brozzler.webconsole'],
|
||||
package_data={
|
||||
'brozzler': ['behaviors.d/*.js*', 'behaviors.yaml'],
|
||||
'brozzler': ['behaviors.d/*.js*', 'behaviors.yaml', 'job_schema.yaml'],
|
||||
'brozzler.webconsole': find_package_data('brozzler.webconsole'),
|
||||
},
|
||||
entry_points={
|
||||
@ -62,11 +62,12 @@ setuptools.setup(
|
||||
'reppy',
|
||||
'requests',
|
||||
'websocket-client',
|
||||
'pillow',
|
||||
'pillow==3.3.0',
|
||||
'surt>=0.3.0',
|
||||
'rethinkstuff>=0.1.5',
|
||||
'rethinkdb>=2.3,<2.4',
|
||||
'psutil',
|
||||
'psutil==4.3.0',
|
||||
'cerberus==1.0.1',
|
||||
],
|
||||
extras_require={
|
||||
'webconsole': ['flask>=0.11', 'gunicorn'],
|
||||
|
@ -1,8 +1,7 @@
|
||||
#!/usr/bin/env python
|
||||
'''
|
||||
cluster-integration-tests.py - integration tests for a brozzler cluster,
|
||||
expects brozzler, warcprox, pywb, rethinkdb and other dependencies to be
|
||||
running already
|
||||
test_cluster.py - integration tests for a brozzler cluster, expects brozzler,
|
||||
warcprox, pywb, rethinkdb and other dependencies to be running already
|
||||
|
||||
Copyright (C) 2016 Internet Archive
|
||||
|
||||
@ -26,6 +25,10 @@ import urllib.request
|
||||
import os
|
||||
import socket
|
||||
import rethinkstuff
|
||||
import time
|
||||
import brozzler
|
||||
import datetime
|
||||
import requests
|
||||
|
||||
@pytest.fixture(scope='module')
|
||||
def httpd(request):
|
||||
@ -53,13 +56,13 @@ def test_httpd(httpd):
|
||||
'''
|
||||
payload1 = content2 = None
|
||||
with urllib.request.urlopen(
|
||||
'http://localhost:%s/' % httpd.server_port) as response:
|
||||
'http://localhost:%s/file1.txt' % httpd.server_port) as response:
|
||||
assert response.status == 200
|
||||
payload1 = response.read()
|
||||
assert payload1
|
||||
|
||||
with urllib.request.urlopen(
|
||||
'http://localhost:%s/' % httpd.server_port) as response:
|
||||
'http://localhost:%s/file1.txt' % httpd.server_port) as response:
|
||||
assert response.status == 200
|
||||
payload2 = response.read()
|
||||
assert payload2
|
||||
@ -68,21 +71,71 @@ def test_httpd(httpd):
|
||||
|
||||
def test_services_up():
|
||||
'''Check that the expected services are up and running.'''
|
||||
# check that warcprox is listening
|
||||
with socket.socket() as s:
|
||||
# if the connect fails an exception is raised and the test fails
|
||||
s.connect(('localhost', 8000))
|
||||
|
||||
### # check that pywb is listening
|
||||
### with socket.socket() as s:
|
||||
### # if the connect fails an exception is raised and the test fails
|
||||
### s.connect(('localhost', 8880))
|
||||
|
||||
# check that rethinkdb is listening and looks sane
|
||||
r = rethinkstuff.Rethinker(db='rethinkdb') # built-in db
|
||||
tbls = r.table_list().run()
|
||||
assert len(tbls) > 10
|
||||
|
||||
def test_brozzle_site(httpd):
|
||||
pass
|
||||
# check that warcprox is listening
|
||||
with socket.socket() as s:
|
||||
# if the connect fails an exception is raised and the test fails
|
||||
s.connect(('localhost', 8000))
|
||||
|
||||
# check that pywb is listening
|
||||
with socket.socket() as s:
|
||||
# if the connect fails an exception is raised and the test fails
|
||||
s.connect(('localhost', 8880))
|
||||
|
||||
# check that brozzler webconsole is listening
|
||||
with socket.socket() as s:
|
||||
# if the connect fails an exception is raised and the test fails
|
||||
s.connect(('localhost', 8881))
|
||||
|
||||
def test_brozzle_site(httpd):
    '''
    Brozzle a small site end to end and check the results at each stage:
    the pages recorded in the frontier, the rows in the `captures` table,
    and playback of a captured url through pywb.

    Talks to a proxy at localhost:8000, rethinkdb on localhost (db
    `brozzler`) and pywb at localhost:8880, all of which must already be
    running.

    `httpd` is the module-scoped fixture; only its `server_port` attribute
    is used here. It is presumably serving the `htdocs` directory next to
    this file, since the pywb payload is compared against
    `htdocs/file1.txt` read from disk.
    '''
    # unique per run, so this run's rows can be picked out of `captures`
    test_id = 'test_brozzle_site-%s' % datetime.datetime.utcnow().isoformat()
    site = brozzler.Site(
            seed='http://localhost:%s/' % httpd.server_port,
            proxy='localhost:8000', enable_warcprox_features=True,
            warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})

    # the two pages we expect to be crawled
    page1 = 'http://localhost:%s/' % httpd.server_port
    page2 = 'http://localhost:%s/file1.txt' % httpd.server_port

    # the site gets its id when it is added to the frontier, and at that
    # point only the seed page exists
    assert site.id is None
    r = rethinkstuff.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(r)
    brozzler.new_site(frontier, site)
    assert site.id is not None
    assert len(list(frontier.site_pages(site.id))) == 1

    # the site should be brozzled fairly quickly; poll for up to 5 minutes
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site = frontier.site(site.id)
    assert site.status == 'FINISHED'

    # check that we got the two pages we expected
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 2
    assert {page.url for page in pages} == {page1, page2}

    # take a look at the captures table; HEAD requests are left out so that
    # each url maps to a capture made with some other method
    captures = r.table('captures').filter({'test_id':test_id}).run()
    captures_by_url = {
            c['url']: c for c in captures if c['http_method'] != 'HEAD'}
    assert page1 in captures_by_url
    assert '%srobots.txt' % page1 in captures_by_url
    assert page2 in captures_by_url
    assert 'screenshot:%s' % page1 in captures_by_url
    assert 'thumbnail:%s' % page1 in captures_by_url
    # no screenshots of plaintext

    # check pywb: replaying the capture of page2 must return exactly the
    # bytes of the file on disk
    t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
    expected_path = os.path.join(
            os.path.dirname(__file__), 'htdocs', 'file1.txt')
    # read inside a with-block so the file handle is closed deterministically
    with open(expected_path, 'rb') as f:
        expected_payload = f.read()
    assert requests.get(wb_url).content == expected_payload
|
||||
|
4
vagrant/Vagrantfile
vendored
4
vagrant/Vagrantfile
vendored
@ -7,7 +7,7 @@ Vagrant.configure(2) do |config|
|
||||
config.vm.synced_folder "..", "/brozzler"
|
||||
|
||||
config.vm.provision "ansible" do |ansible|
|
||||
ansible.inventory_path = "ansible/hosts"
|
||||
ansible.playbook = "ansible/playbook.yml"
|
||||
ansible.inventory_path = "../ansible/hosts-vagrant"
|
||||
ansible.playbook = "../ansible/playbook.yml"
|
||||
end
|
||||
end
|
||||
|
@ -1,16 +0,0 @@
|
||||
ansible_ssh_private_key_file=.vagrant/machines/10.9.9.9/virtualbox/private_key
|
||||
|
||||
[rethinkdb]
|
||||
10.9.9.9
|
||||
|
||||
[warcprox]
|
||||
10.9.9.9
|
||||
|
||||
[brozzler-worker]
|
||||
10.9.9.9
|
||||
|
||||
[brozzler-webconsole]
|
||||
10.9.9.9
|
||||
|
||||
[pywb]
|
||||
10.9.9.9
|
@ -1,14 +0,0 @@
|
||||
description "pywb"
|
||||
|
||||
start on runlevel [2345]
|
||||
stop on runlevel [!2345]
|
||||
|
||||
env PYTHONPATH=/home/vagrant/pywb-ve34/lib/python3.4/site-packages
|
||||
env PATH=/home/vagrant/pywb-ve34/bin:/usr/bin:/bin
|
||||
env PYWB_CONFIG_FILE=/etc/pywb.yml
|
||||
|
||||
setuid vagrant
|
||||
|
||||
# console log
|
||||
|
||||
exec nice brozzler-wayback >>/vagrant/logs/pywb.log 2>&1
|
@ -1,5 +0,0 @@
|
||||
runuser=vagrant
|
||||
bind=0.0.0.0
|
||||
# directory=/var/lib/rethinkdb
|
||||
# log-file=/var/log/rethinkdb.log
|
||||
log-file=/vagrant/logs/rethinkdb.log # synced dir
|
@ -1,14 +0,0 @@
|
||||
---
|
||||
# - name: start warcprox
|
||||
# environment:
|
||||
# PYTHONPATH: /home/vagrant/warcprox-ve34/lib/python3.4/site-packages
|
||||
# PATH: /home/vagrant/warcprox-ve34/bin:/usr/bin:/bin
|
||||
# args:
|
||||
# executable: /bin/bash
|
||||
# shell: nice warcprox --dir=/vagrant/warcs --base32 --gzip
|
||||
# --rollover-idle-time=180 --cacert=/vagrant/warcprox-ca.pem
|
||||
# --onion-tor-socks-proxy=localhost:9050 --rethinkdb-servers=localhost
|
||||
# --rethinkdb-big-table &> /vagrant/logs/warcprox.out &
|
||||
- name: restart warcprox
|
||||
service: name=warcprox state=restarted
|
||||
become: true
|
@ -1,26 +0,0 @@
|
||||
description "warcprox"
|
||||
|
||||
start on runlevel [2345]
|
||||
stop on runlevel [!2345]
|
||||
|
||||
env PYTHONPATH=/home/vagrant/warcprox-ve34/lib/python3.4/site-packages
|
||||
env PATH=/home/vagrant/warcprox-ve34/bin:/usr/bin:/bin
|
||||
|
||||
# by default warcprox creates some files/dirs relative to cwd
|
||||
chdir /home/vagrant
|
||||
setuid vagrant
|
||||
|
||||
# console log
|
||||
|
||||
# --profile
|
||||
exec nice warcprox \
|
||||
--dir=/vagrant/warcs \
|
||||
--base32 \
|
||||
--gzip \
|
||||
--rollover-idle-time=180 \
|
||||
--cacert=/vagrant/warcprox-ca.pem \
|
||||
--onion-tor-socks-proxy=localhost:9050 \
|
||||
--rethinkdb-servers=localhost \
|
||||
--rethinkdb-db=brozzler \
|
||||
--rethinkdb-big-table >>/vagrant/logs/warcprox.log 2>&1
|
||||
# --rethinkdb-servers={{groups['rethinkdb'] | join(',')}} \
|
@ -1,5 +1,7 @@
|
||||
#!/bin/bash
|
||||
|
||||
cd $(dirname "${BASH_SOURCE[0]}")
|
||||
|
||||
echo service status:
|
||||
vagrant ssh -- 'status warcprox ;
|
||||
status Xvnc ;
|
||||
@ -8,5 +10,5 @@ vagrant ssh -- 'status warcprox ;
|
||||
status vnc-websock'
|
||||
echo
|
||||
|
||||
vagrant ssh -- 'source brozzler-ve34/bin/activate && pip install pytest'
|
||||
vagrant ssh -- 'source brozzler-ve34/bin/activate && py.test -v -s /brozzler/tests'
|
||||
vagrant ssh -- 'source /opt/brozzler-ve34/bin/activate && pip install pytest'
|
||||
vagrant ssh -- 'source /opt/brozzler-ve34/bin/activate && py.test -v -s /brozzler/tests'
|
||||
|
@ -6,6 +6,20 @@ queue a job for your vagrant brozzler deployment.
|
||||
This is a standalone script with no dependencies other than python, and should
|
||||
work with python 2.7 or python 3.2+. The only reason it's not a bash script is
|
||||
so we can use the argparse library.
|
||||
|
||||
Copyright (C) 2016 Internet Archive
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
'''
|
||||
|
||||
import sys
|
||||
@ -20,23 +34,17 @@ def main(argv=[]):
|
||||
help='brozzler job configuration file in yaml')
|
||||
args = arg_parser.parse_args(args=argv[1:])
|
||||
|
||||
with open(args.job_conf_file, 'rb') as f:
|
||||
yaml_bytes = f.read()
|
||||
subprocess.call(
|
||||
['vagrant', 'ssh', '--', 'f=`mktemp` && cat > $f'],
|
||||
stdin=yaml_bytes)
|
||||
|
||||
# cd to path with Vagrantfile so "vagrant ssh" knows what to do
|
||||
os.chdir(os.path.dirname(__file__))
|
||||
|
||||
with open(args.job_conf_file, 'rb') as f:
|
||||
subprocess.call([
|
||||
'vagrant', 'ssh', '--',
|
||||
'f=`mktemp` && cat > $f && '
|
||||
'PYTHONPATH=/home/vagrant/brozzler-ve34/lib/python3.4/site-packages '
|
||||
'/home/vagrant/brozzler-ve34/bin/python '
|
||||
'/home/vagrant/brozzler-ve34/bin/brozzler-new-job $f'],
|
||||
stdin=f)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main(sys.argv)
|
||||
|
||||
## # cd to path with Vagrantfile so "vagrant ssh" knows what to do
|
||||
## script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
## cd $script_dir
|
||||
##
|
||||
## vagrant ssh -- \
|
||||
## PYTHONPATH=/home/vagrant/brozzler-ve34/lib/python3.4/site-packages \
|
||||
## /home/vagrant/brozzler-ve34/bin/python \
|
||||
## /home/vagrant/brozzler-ve34/bin/brozzler-new-job "$@"
|
||||
|
@ -10,7 +10,7 @@ This is a standalone script with no dependencies other than python, and should
|
||||
work with python 2.7 or python 3.2+. The only reason it's not a bash script is
|
||||
so we can use the argparse library.
|
||||
|
||||
Copyright (C) 2014-2016 Internet Archive
|
||||
Copyright (C) 2016 Internet Archive
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
|
Loading…
x
Reference in New Issue
Block a user