mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-06-20 12:54:23 -04:00
vagrant setup (unfinished)
This commit is contained in:
parent
79ad57669c
commit
2aef00826b
19 changed files with 369 additions and 1 deletions
2
setup.py
2
setup.py
|
@ -21,7 +21,7 @@ import setuptools
|
||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.1.dev40',
|
version='1.1.dev41',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
|
61
vagrant/README.rst
Normal file
61
vagrant/README.rst
Normal file
|
@ -0,0 +1,61 @@
|
||||||
|
Single-VM Vagrant Brozzler Deployment
|
||||||
|
-------------------------------------
|
||||||
|
|
||||||
|
This is a work in progress. Vagrant + ansible configuration for a single-vm
|
||||||
|
deployment of brozzler and warcprox with dependencies (notably rethinkdb).
|
||||||
|
|
||||||
|
The idea is for this to be a quick way for people to get up and running with a
|
||||||
|
deployment resembling a real distributed deployment, and to offer a starting
|
||||||
|
configuration for people to adapt to their clusters.
|
||||||
|
|
||||||
|
Equally important, it is meant to serve as a harness for integration tests. (As of now brozzler
|
||||||
|
itself has no automated tests!)
|
||||||
|
|
||||||
|
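You'll need vagrant installed, as well as ansible (the Vagrantfile uses vagrant's "ansible" provisioner, which runs ansible from your machine).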
|
||||||
|
https://www.vagrantup.com/docs/installation/
|
||||||
|
Then run:
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
my-laptop$ vagrant up
|
||||||
|
|
||||||
|
Currently, to start a crawl you first need to ssh into the vagrant vm and activate
|
||||||
|
the brozzler virtualenv.
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
my-laptop$ vagrant ssh
|
||||||
|
vagrant@brozzler-easy:~$ source ~/brozzler-ve34/bin/activate
|
||||||
|
(brozzler-ve34)vagrant@brozzler-easy:~$
|
||||||
|
|
||||||
|
Then you can run brozzler-new-site:
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
(brozzler-ve34)vagrant@brozzler-easy:~$ brozzler-new-site \
|
||||||
|
--proxy=localhost:8000 --enable-warcprox-features \
|
||||||
|
http://example.com/
|
||||||
|
|
||||||
|
|
||||||
|
Or brozzler-new-job (make sure to set the proxy to localhost:8000):
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
(brozzler-ve34)vagrant@brozzler-easy:~$ cat >job1.yml
|
||||||
|
id: job1
|
||||||
|
proxy: localhost:8000 # point at warcprox for archiving
|
||||||
|
enable_warcprox_features: true
|
||||||
|
seeds:
|
||||||
|
- url: https://example.org/
|
||||||
|
(brozzler-ve34)vagrant@brozzler-easy:~$ brozzler-new-job job1.yml
|
||||||
|
|
||||||
|
WARC files will appear in ./warcs, and brozzler, warcprox and rethinkdb logs in
|
||||||
|
./logs (via vagrant folders syncing).
|
||||||
|
|
||||||
|
You can also look at the rethinkdb console by opening http://localhost:8080 in
|
||||||
|
your browser after opening an ssh tunnel like so:
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
my-laptop$ vagrant ssh -- -fN -Llocalhost:8080:localhost:8080
|
||||||
|
|
14
vagrant/Vagrantfile
vendored
Normal file
14
vagrant/Vagrantfile
vendored
Normal file
|
@ -0,0 +1,14 @@
|
||||||
|
Vagrant.configure(2) do |config|
|
||||||
|
config.vm.box = "ubuntu/trusty64"
|
||||||
|
config.vm.hostname = "brozzler-easy"
|
||||||
|
|
||||||
|
config.vm.provision "ansible" do |ansible|
|
||||||
|
ansible.playbook = "ansible/playbook.yml"
|
||||||
|
ansible.groups = {
|
||||||
|
"rethinkdb" => ["default"],
|
||||||
|
"warcprox" => ["default"],
|
||||||
|
"brozzler-worker" => ["default"],
|
||||||
|
# "brozzler-webconsole" => ["default"],
|
||||||
|
}
|
||||||
|
end
|
||||||
|
end
|
1
vagrant/ansible/playbook.retry
Normal file
1
vagrant/ansible/playbook.retry
Normal file
|
@ -0,0 +1 @@
|
||||||
|
default
|
30
vagrant/ansible/playbook.yml
Normal file
30
vagrant/ansible/playbook.yml
Normal file
|
@ -0,0 +1,30 @@
|
||||||
|
---
|
||||||
|
- name: apply common configuration to all nodes
|
||||||
|
hosts: all
|
||||||
|
roles:
|
||||||
|
- common
|
||||||
|
|
||||||
|
- name: deploy rethinkdb
|
||||||
|
hosts: rethinkdb
|
||||||
|
roles:
|
||||||
|
- rethinkdb
|
||||||
|
|
||||||
|
- name: deploy warcprox
|
||||||
|
hosts: warcprox
|
||||||
|
roles:
|
||||||
|
- warcprox
|
||||||
|
|
||||||
|
- name: deploy brozzler-worker
|
||||||
|
hosts: brozzler-worker
|
||||||
|
roles:
|
||||||
|
- brozzler-worker
|
||||||
|
|
||||||
|
# - name: deploy brozzler-webconsole
|
||||||
|
# hosts: brozzler-webconsole
|
||||||
|
# roles:
|
||||||
|
# - brozzler-webconsole
|
||||||
|
|
||||||
|
# - name: deploy pywb
|
||||||
|
# hosts: pywb
|
||||||
|
# roles:
|
||||||
|
# - pywb
|
19
vagrant/ansible/roles/brozzler-webconsole/tasks/main.yml
Normal file
19
vagrant/ansible/roles/brozzler-webconsole/tasks/main.yml
Normal file
|
@ -0,0 +1,19 @@
|
||||||
|
---
|
||||||
|
- name: git clone https://github.com/internetarchive/brozzler.git
|
||||||
|
git: repo=https://github.com/internetarchive/brozzler.git
|
||||||
|
dest=/home/vagrant/brozzler
|
||||||
|
- name: pip install -r requirements.txt in virtualenv
|
||||||
|
pip: requirements=/home/vagrant/brozzler/webconsole/requirements.txt
|
||||||
|
virtualenv=/home/vagrant/brozzler-webconsole-ve34
|
||||||
|
virtualenv_python=python3.4
|
||||||
|
extra_args='--no-input --upgrade --pre'
|
||||||
|
notify:
|
||||||
|
- restart brozzler-webconsole
|
||||||
|
- name: install upstart config /etc/init/brozzler-webconsole.conf
|
||||||
|
become: true
|
||||||
|
template: src=templates/brozzler-webconsole.conf.j2
|
||||||
|
dest=/etc/init/brozzler-webconsole.conf
|
||||||
|
notify:
|
||||||
|
- restart brozzler-webconsole
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,21 @@
|
||||||
|
description "brozzler-webconsole"
|
||||||
|
|
||||||
|
start on runlevel [2345]
|
||||||
|
stop on runlevel [!2345]
|
||||||
|
|
||||||
|
env PYTHONPATH=/home/vagrant/brozzler-webconsole-ve34/lib/python3.4/site-packages:/home/vagrant/brozzler/webconsole
|
||||||
|
env PATH=/home/vagrant/brozzler-webconsole-ve34/bin:/usr/bin:/bin
|
||||||
|
env LC_ALL=C.UTF-8
|
||||||
|
|
||||||
|
env WAYBACK_BASEURL={{base_wayback_url}}/all
|
||||||
|
# env RETHINKDB_SERVERS={{groups['rethinkdb'] | join(',')}}
|
||||||
|
env RETHINKDB_SERVERS=localhost
|
||||||
|
env RETHINKDB_DB={{rethinkdb_db}}
|
||||||
|
|
||||||
|
setuid vagrant
|
||||||
|
|
||||||
|
# console log
|
||||||
|
|
||||||
|
exec gunicorn --bind=0.0.0.0:8081 brozzler-webconsole:app >&/vagrant/logs/brozzler-webconsole.log
|
||||||
|
|
||||||
|
|
13
vagrant/ansible/roles/brozzler-worker/handlers/main.yml
Normal file
13
vagrant/ansible/roles/brozzler-worker/handlers/main.yml
Normal file
|
@ -0,0 +1,13 @@
|
||||||
|
---
|
||||||
|
- name: restart Xvnc
|
||||||
|
service: name=Xvnc state=restarted
|
||||||
|
become: true
|
||||||
|
- name: restart websockify
|
||||||
|
service: name=websockify state=restarted
|
||||||
|
become: true
|
||||||
|
- name: restart vnc-websock
|
||||||
|
service: name=vnc-websock state=restarted
|
||||||
|
become: true
|
||||||
|
- name: restart brozzler-worker
|
||||||
|
service: name=brozzler-worker state=restarted
|
||||||
|
become: true
|
59
vagrant/ansible/roles/brozzler-worker/tasks/main.yml
Normal file
59
vagrant/ansible/roles/brozzler-worker/tasks/main.yml
Normal file
|
@ -0,0 +1,59 @@
|
||||||
|
---
|
||||||
|
- name: ensure required packages are installed
|
||||||
|
become: true
|
||||||
|
apt: name={{item}} state=present
|
||||||
|
with_items:
|
||||||
|
- python-virtualenv
|
||||||
|
- vnc4server
|
||||||
|
- chromium-browser
|
||||||
|
- xfonts-base
|
||||||
|
- fonts-arphic-bkai00mp
|
||||||
|
- fonts-arphic-bsmi00lp
|
||||||
|
- fonts-arphic-gbsn00lp
|
||||||
|
- fonts-arphic-gkai00mp
|
||||||
|
- fonts-arphic-ukai
|
||||||
|
- fonts-farsiweb
|
||||||
|
- fonts-nafees
|
||||||
|
- fonts-sil-abyssinica
|
||||||
|
- fonts-sil-ezra
|
||||||
|
- fonts-sil-padauk
|
||||||
|
- fonts-unfonts-extra
|
||||||
|
- fonts-unfonts-core
|
||||||
|
- ttf-indic-fonts
|
||||||
|
- fonts-thai-tlwg
|
||||||
|
- fonts-lklug-sinhala
|
||||||
|
- python3-pip
|
||||||
|
- git
|
||||||
|
- libjpeg-turbo8-dev
|
||||||
|
- zlib1g-dev
|
||||||
|
- gcc
|
||||||
|
- libpython3.4-dev
|
||||||
|
- git
|
||||||
|
- name: install Xvnc upstart config /etc/init/Xvnc.conf
|
||||||
|
template: src=templates/Xvnc.conf.j2 dest=/etc/init/Xvnc.conf
|
||||||
|
become: true
|
||||||
|
notify:
|
||||||
|
- restart Xvnc
|
||||||
|
- name: install websockify in virtualenv
|
||||||
|
pip: name=git+https://github.com/kanaka/websockify.git#egg=websockify
|
||||||
|
virtualenv=/home/vagrant/websockify-ve34
|
||||||
|
virtualenv_python=python3.4
|
||||||
|
extra_args='--no-input --upgrade --pre'
|
||||||
|
- name: install vnc-websock upstart config /etc/init/vnc-websock.conf
|
||||||
|
template: src=templates/vnc-websock.conf.j2 dest=/etc/init/vnc-websock.conf
|
||||||
|
become: true
|
||||||
|
notify:
|
||||||
|
- restart vnc-websock
|
||||||
|
- name: install brozzler in virtualenv
|
||||||
|
become: true
|
||||||
|
pip: name=git+https://github.com/internetarchive/brozzler.git#egg=brozzler
|
||||||
|
virtualenv=/home/vagrant/brozzler-ve34
|
||||||
|
virtualenv_python=python3.4
|
||||||
|
extra_args='--no-input --upgrade --pre'
|
||||||
|
notify:
|
||||||
|
- restart brozzler-worker
|
||||||
|
- name: install brozzler-worker upstart config /etc/init/brozzler-worker.conf
|
||||||
|
template: src=templates/brozzler-worker.conf.j2 dest=/etc/init/brozzler-worker.conf
|
||||||
|
become: true
|
||||||
|
notify:
|
||||||
|
- restart brozzler-worker
|
14
vagrant/ansible/roles/brozzler-worker/templates/Xvnc.conf.j2
Normal file
14
vagrant/ansible/roles/brozzler-worker/templates/Xvnc.conf.j2
Normal file
|
@ -0,0 +1,14 @@
|
||||||
|
description "Xvnc"
|
||||||
|
|
||||||
|
start on runlevel [2345]
|
||||||
|
stop on runlevel [!2345]
|
||||||
|
|
||||||
|
setuid vagrant
|
||||||
|
|
||||||
|
console log
|
||||||
|
|
||||||
|
exec nice Xvnc4 :1 -auth /tmp/Xauthority.vagrant \
|
||||||
|
-geometry 1600x1000 -depth 24 -rfbwait 0 -nolisten tcp -rfbport 5901 \
|
||||||
|
-SecurityTypes None -pn -fp /usr/share/fonts/X11/misc/ -co /etc/X11/rgb \
|
||||||
|
AcceptCutText=0 AcceptPointerEvents=0 AcceptKeyEvents=0
|
||||||
|
|
|
@ -0,0 +1,25 @@
|
||||||
|
description "brozzler-worker"
|
||||||
|
|
||||||
|
start on runlevel [2345]
|
||||||
|
stop on runlevel [!2345]
|
||||||
|
|
||||||
|
env DISPLAY=:1
|
||||||
|
env PATH=/home/vagrant/brozzler-ve34/bin:/usr/bin:/bin
|
||||||
|
env PYTHONPATH=/home/vagrant/brozzler-ve34/lib/python3.4/site-packages
|
||||||
|
env LANG=C.UTF-8
|
||||||
|
|
||||||
|
setuid vagrant
|
||||||
|
|
||||||
|
# console log
|
||||||
|
|
||||||
|
# depends on vnc server
|
||||||
|
start on started Xvnc
|
||||||
|
stop on stopping Xvnc
|
||||||
|
|
||||||
|
kill timeout 60
|
||||||
|
|
||||||
|
exec nice brozzler-worker \
|
||||||
|
--rethinkdb-servers=localhost \
|
||||||
|
--max-browsers=4 >>/vagrant/logs/brozzler-worker.log 2>&1
|
||||||
|
# --rethinkdb-servers={{groups['rethinkdb'] | join(',')}} \
|
||||||
|
|
|
@ -0,0 +1,14 @@
|
||||||
|
description "vnc-websock"
|
||||||
|
|
||||||
|
start on runlevel [2345]
|
||||||
|
stop on runlevel [!2345]
|
||||||
|
|
||||||
|
setuid vagrant
|
||||||
|
|
||||||
|
console log
|
||||||
|
|
||||||
|
env PYTHONPATH=/home/vagrant/websockify-ve34/lib/python3.4/site-packages
|
||||||
|
env PATH=/home/vagrant/websockify-ve34/bin:/usr/bin:/bin
|
||||||
|
|
||||||
|
exec nice websockify 0.0.0.0:8901 localhost:5901
|
||||||
|
|
4
vagrant/ansible/roles/common/tasks/main.yml
Normal file
4
vagrant/ansible/roles/common/tasks/main.yml
Normal file
|
@ -0,0 +1,4 @@
|
||||||
|
---
|
||||||
|
- name: ensure logs directory exists
|
||||||
|
file: path=/vagrant/logs state=directory
|
||||||
|
become: true
|
4
vagrant/ansible/roles/rethinkdb/handlers/main.yml
Normal file
4
vagrant/ansible/roles/rethinkdb/handlers/main.yml
Normal file
|
@ -0,0 +1,4 @@
|
||||||
|
---
|
||||||
|
- name: restart rethinkdb
|
||||||
|
service: name=rethinkdb state=restarted
|
||||||
|
become: true
|
19
vagrant/ansible/roles/rethinkdb/tasks/main.yml
Normal file
19
vagrant/ansible/roles/rethinkdb/tasks/main.yml
Normal file
|
@ -0,0 +1,19 @@
|
||||||
|
---
|
||||||
|
- name: ensure rethinkdb apt public key is trusted
|
||||||
|
apt_key: url=http://download.rethinkdb.com/apt/pubkey.gpg
|
||||||
|
become: true
|
||||||
|
- name: ensure rethinkdb repo is in apt sources.list
|
||||||
|
apt_repository: repo='deb http://download.rethinkdb.com/apt trusty main'
|
||||||
|
state=present
|
||||||
|
become: true
|
||||||
|
- name: ensure rethinkdb package is installed
|
||||||
|
apt: name=rethinkdb state=present
|
||||||
|
become: true
|
||||||
|
notify:
|
||||||
|
- restart rethinkdb
|
||||||
|
- name: ensure rethinkdb instance config file is installed
|
||||||
|
template: src=templates/rethinkdb-brozzler-easy.conf.j2
|
||||||
|
dest=/etc/rethinkdb/instances.d/rethinkdb-brozzler-easy.conf
|
||||||
|
become: true
|
||||||
|
notify:
|
||||||
|
- restart rethinkdb
|
|
@ -0,0 +1,5 @@
|
||||||
|
runuser=vagrant
|
||||||
|
# bind=0.0.0.0
|
||||||
|
# directory=/var/lib/rethinkdb
|
||||||
|
# log-file=/var/log/rethinkdb.log
|
||||||
|
log-file=/vagrant/logs/rethinkdb.log # synced dir
|
14
vagrant/ansible/roles/warcprox/handlers/main.yml
Normal file
14
vagrant/ansible/roles/warcprox/handlers/main.yml
Normal file
|
@ -0,0 +1,14 @@
|
||||||
|
---
|
||||||
|
# - name: start warcprox
|
||||||
|
# environment:
|
||||||
|
# PYTHONPATH: /home/vagrant/warcprox-ve34/lib/python3.4/site-packages
|
||||||
|
# PATH: /home/vagrant/warcprox-ve34/bin:/usr/bin:/bin
|
||||||
|
# args:
|
||||||
|
# executable: /bin/bash
|
||||||
|
# shell: nice warcprox --dir=/vagrant/warcs --base32 --gzip
|
||||||
|
# --rollover-idle-time=180 --cacert=/vagrant/warcprox-ca.pem
|
||||||
|
# --onion-tor-socks-proxy=localhost:9050 --rethinkdb-servers=localhost
|
||||||
|
# --rethinkdb-big-table &> /vagrant/logs/warcprox.out &
|
||||||
|
- name: restart warcprox
|
||||||
|
service: name=warcprox state=restarted
|
||||||
|
become: true
|
25
vagrant/ansible/roles/warcprox/tasks/main.yml
Normal file
25
vagrant/ansible/roles/warcprox/tasks/main.yml
Normal file
|
@ -0,0 +1,25 @@
|
||||||
|
---
|
||||||
|
- name: ensure required packages are installed
|
||||||
|
become: true
|
||||||
|
apt: name={{item}} state=present
|
||||||
|
with_items:
|
||||||
|
- gcc
|
||||||
|
- python-virtualenv
|
||||||
|
- python3.4
|
||||||
|
- libpython3.4-dev
|
||||||
|
- libffi-dev
|
||||||
|
- libssl-dev
|
||||||
|
- tor
|
||||||
|
- git
|
||||||
|
- name: install warcprox in virtualenv
|
||||||
|
pip: name=git+https://github.com/internetarchive/warcprox.git@2.x#egg=warcprox
|
||||||
|
virtualenv=/home/vagrant/warcprox-ve34
|
||||||
|
virtualenv_python=python3.4
|
||||||
|
extra_args='--no-input --upgrade --pre'
|
||||||
|
notify:
|
||||||
|
- restart warcprox
|
||||||
|
- name: install upstart config /etc/init/warcprox.conf
|
||||||
|
become: true
|
||||||
|
template: src=templates/warcprox.conf.j2 dest=/etc/init/warcprox.conf
|
||||||
|
notify:
|
||||||
|
- restart warcprox
|
26
vagrant/ansible/roles/warcprox/templates/warcprox.conf.j2
Normal file
26
vagrant/ansible/roles/warcprox/templates/warcprox.conf.j2
Normal file
|
@ -0,0 +1,26 @@
|
||||||
|
description "warcprox"
|
||||||
|
|
||||||
|
start on runlevel [2345]
|
||||||
|
stop on runlevel [!2345]
|
||||||
|
|
||||||
|
env PYTHONPATH=/home/vagrant/warcprox-ve34/lib/python3.4/site-packages
|
||||||
|
env PATH=/home/vagrant/warcprox-ve34/bin:/usr/bin:/bin
|
||||||
|
|
||||||
|
# by default warcprox creates some files/dirs relative to cwd
|
||||||
|
chdir /home/vagrant
|
||||||
|
setuid vagrant
|
||||||
|
|
||||||
|
# console log
|
||||||
|
|
||||||
|
# --profile
|
||||||
|
exec nice warcprox \
|
||||||
|
--dir=/vagrant/warcs \
|
||||||
|
--base32 \
|
||||||
|
--gzip \
|
||||||
|
--rollover-idle-time=180 \
|
||||||
|
--cacert=/vagrant/warcprox-ca.pem \
|
||||||
|
--onion-tor-socks-proxy=localhost:9050 \
|
||||||
|
--rethinkdb-servers=localhost \
|
||||||
|
--rethinkdb-db=brozzler \
|
||||||
|
--rethinkdb-big-table >>/vagrant/logs/warcprox.log 2>&1
|
||||||
|
# --rethinkdb-servers={{groups['rethinkdb'] | join(',')}} \
|
Loading…
Add table
Add a link
Reference in a new issue