mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-19 23:35:54 -04:00
vagrant setup (unfinished)
This commit is contained in:
parent
79ad57669c
commit
2aef00826b
2
setup.py
2
setup.py
@ -21,7 +21,7 @@ import setuptools
|
||||
|
||||
setuptools.setup(
|
||||
name='brozzler',
|
||||
version='1.1.dev40',
|
||||
version='1.1.dev41',
|
||||
description='Distributed web crawling with browsers',
|
||||
url='https://github.com/internetarchive/brozzler',
|
||||
author='Noah Levitt',
|
||||
|
61
vagrant/README.rst
Normal file
61
vagrant/README.rst
Normal file
@ -0,0 +1,61 @@
|
||||
Single-VM Vagrant Brozzler Deployment
|
||||
-------------------------------------
|
||||
|
||||
This is a work in progress: a Vagrant + Ansible configuration for a single-VM
|
||||
deployment of brozzler and warcprox with dependencies (notably rethinkdb).
|
||||
|
||||
The idea is for this to be a quick way for people to get up and running with a
|
||||
deployment resembling a real distributed deployment, and to offer a starting
|
||||
configuration for people to adapt to their clusters.
|
||||
|
||||
Equally important, it is meant to serve as a harness for integration tests. (As of now brozzler
|
||||
itself has no automated tests!)
|
||||
|
||||
You'll need vagrant installed.
|
||||
https://www.vagrantup.com/docs/installation/
|
||||
Then run:
|
||||
|
||||
::
|
||||
|
||||
my-laptop$ vagrant up
|
||||
|
||||
Currently to start a crawl you first need to ssh to the vagrant vm and activate
|
||||
the brozzler virtualenv.
|
||||
|
||||
::
|
||||
|
||||
my-laptop$ vagrant ssh
|
||||
vagrant@brozzler-easy:~$ source ~/brozzler-ve34/bin/activate
|
||||
(brozzler-ve34)vagrant@brozzler-easy:~$
|
||||
|
||||
Then you can run brozzler-new-site:
|
||||
|
||||
::
|
||||
|
||||
(brozzler-ve34)vagrant@brozzler-easy:~$ brozzler-new-site \
|
||||
--proxy=localhost:8000 --enable-warcprox-features \
|
||||
http://example.com/
|
||||
|
||||
|
||||
Or brozzler-new-job (make sure to set the proxy to localhost:8000):
|
||||
|
||||
::
|
||||
|
||||
(brozzler-ve34)vagrant@brozzler-easy:~$ cat >job1.yml
|
||||
id: job1
|
||||
proxy: localhost:8000 # point at warcprox for archiving
|
||||
enable_warcprox_features: true
|
||||
seeds:
|
||||
- url: https://example.org/
|
||||
(brozzler-ve34)vagrant@brozzler-easy:~$ brozzler-new-job job1.yml
|
||||
|
||||
WARC files will appear in ./warcs, and brozzler, warcprox and rethinkdb logs in
|
||||
./logs (via Vagrant synced folders).
|
||||
|
||||
You can also look at the rethinkdb console by opening http://localhost:8080 in
|
||||
your browser after opening an ssh tunnel like so:
|
||||
|
||||
::
|
||||
|
||||
my-laptop$ vagrant ssh -- -fN -Llocalhost:8080:localhost:8080
|
||||
|
14
vagrant/Vagrantfile
vendored
Normal file
14
vagrant/Vagrantfile
vendored
Normal file
@ -0,0 +1,14 @@
|
||||
# Defines the single VM for this deployment and provisions it with the ansible
# playbook in ansible/playbook.yml.
Vagrant.configure(2) do |config|
|
||||
# Base image; the ansible roles install upstart jobs under /etc/init and add a
# "trusty" apt repository, so they are written for this release.
config.vm.box = "ubuntu/trusty64"
|
||||
# Hostname shown in the shell prompts used throughout the README.
config.vm.hostname = "brozzler-easy"
|
||||
|
||||
config.vm.provision "ansible" do |ansible|
|
||||
ansible.playbook = "ansible/playbook.yml"
|
||||
# Inventory groups handed to ansible. Each key matches a `hosts:` value in
# playbook.yml, and every group contains only the machine named "default",
# so all enabled roles are applied to this one VM.
ansible.groups = {
|
||||
"rethinkdb" => ["default"],
|
||||
"warcprox" => ["default"],
|
||||
"brozzler-worker" => ["default"],
|
||||
# webconsole group is disabled, matching the commented-out play in playbook.yml
# "brozzler-webconsole" => ["default"],
|
||||
}
|
||||
end
|
||||
end
|
1
vagrant/ansible/playbook.retry
Normal file
1
vagrant/ansible/playbook.retry
Normal file
@ -0,0 +1 @@
|
||||
default
|
30
vagrant/ansible/playbook.yml
Normal file
30
vagrant/ansible/playbook.yml
Normal file
@ -0,0 +1,30 @@
|
||||
---
|
||||
- name: apply common configuration to all nodes
|
||||
hosts: all
|
||||
roles:
|
||||
- common
|
||||
|
||||
- name: deploy rethinkdb
|
||||
hosts: rethinkdb
|
||||
roles:
|
||||
- rethinkdb
|
||||
|
||||
- name: deploy warcprox
|
||||
hosts: warcprox
|
||||
roles:
|
||||
- warcprox
|
||||
|
||||
- name: deploy brozzler-worker
|
||||
hosts: brozzler-worker
|
||||
roles:
|
||||
- brozzler-worker
|
||||
|
||||
# - name: deploy brozzler-webconsole
|
||||
# hosts: brozzler-webconsole
|
||||
# roles:
|
||||
# - brozzler-webconsole
|
||||
|
||||
# - name: deploy pywb
|
||||
# hosts: pywb
|
||||
# roles:
|
||||
# - pywb
|
19
vagrant/ansible/roles/brozzler-webconsole/tasks/main.yml
Normal file
19
vagrant/ansible/roles/brozzler-webconsole/tasks/main.yml
Normal file
@ -0,0 +1,19 @@
|
||||
---
|
||||
- name: git clone https://github.com/internetarchive/brozzler.git
|
||||
git: repo=https://github.com/internetarchive/brozzler.git
|
||||
dest=/home/vagrant/brozzler
|
||||
- name: pip install -r requirements.txt in virtualenv
|
||||
pip: requirements=/home/vagrant/brozzler/webconsole/requirements.txt
|
||||
virtualenv=/home/vagrant/brozzler-webconsole-ve34
|
||||
virtualenv_python=python3.4
|
||||
extra_args='--no-input --upgrade --pre'
|
||||
notify:
|
||||
- restart brozzler-webconsole
|
||||
- name: install upstart config /etc/init/brozzler-webconsole.conf
|
||||
become: true
|
||||
template: src=templates/brozzler-webconsole.conf.j2
|
||||
dest=/etc/init/brozzler-webconsole.conf
|
||||
notify:
|
||||
- restart brozzler-webconsole
|
||||
|
||||
|
@ -0,0 +1,21 @@
|
||||
description "brozzler-webconsole"
|
||||
|
||||
start on runlevel [2345]
|
||||
stop on runlevel [!2345]
|
||||
|
||||
env PYTHONPATH=/home/vagrant/brozzler-webconsole-ve34/lib/python3.4/site-packages:/home/vagrant/brozzler/webconsole
|
||||
env PATH=/home/vagrant/brozzler-webconsole-ve34/bin:/usr/bin:/bin
|
||||
env LC_ALL=C.UTF-8
|
||||
|
||||
env WAYBACK_BASEURL={{base_wayback_url}}/all
|
||||
# env RETHINKDB_SERVERS={{groups['rethinkdb'] | join(',')}}
|
||||
env RETHINKDB_SERVERS=localhost
|
||||
env RETHINKDB_DB={{rethinkdb_db}}
|
||||
|
||||
setuid vagrant
|
||||
|
||||
# console log
|
||||
|
||||
# Serve the webconsole with gunicorn on port 8081, appending stdout and stderr
# to the log directory under /vagrant. The redirect is spelled `>>file 2>&1`,
# like the brozzler-worker and warcprox jobs, because upstart passes a command
# containing shell metacharacters to /bin/sh, and the bash-only `>&file` form
# is not portable there (dash rejects it).
exec gunicorn --bind=0.0.0.0:8081 brozzler-webconsole:app >>/vagrant/logs/brozzler-webconsole.log 2>&1
|
||||
|
||||
|
13
vagrant/ansible/roles/brozzler-worker/handlers/main.yml
Normal file
13
vagrant/ansible/roles/brozzler-worker/handlers/main.yml
Normal file
@ -0,0 +1,13 @@
|
||||
---
|
||||
- name: restart Xvnc
|
||||
service: name=Xvnc state=restarted
|
||||
become: true
|
||||
- name: restart websockify
|
||||
service: name=websockify state=restarted
|
||||
become: true
|
||||
- name: restart vnc-websock
|
||||
service: name=vnc-websock state=restarted
|
||||
become: true
|
||||
- name: restart brozzler-worker
|
||||
service: name=brozzler-worker state=restarted
|
||||
become: true
|
59
vagrant/ansible/roles/brozzler-worker/tasks/main.yml
Normal file
59
vagrant/ansible/roles/brozzler-worker/tasks/main.yml
Normal file
@ -0,0 +1,59 @@
|
||||
---
|
||||
- name: ensure required packages are installed
|
||||
become: true
|
||||
apt: name={{item}} state=present
|
||||
with_items:
|
||||
- python-virtualenv
|
||||
- vnc4server
|
||||
- chromium-browser
|
||||
- xfonts-base
|
||||
- fonts-arphic-bkai00mp
|
||||
- fonts-arphic-bsmi00lp
|
||||
- fonts-arphic-gbsn00lp
|
||||
- fonts-arphic-gkai00mp
|
||||
- fonts-arphic-ukai
|
||||
- fonts-farsiweb
|
||||
- fonts-nafees
|
||||
- fonts-sil-abyssinica
|
||||
- fonts-sil-ezra
|
||||
- fonts-sil-padauk
|
||||
- fonts-unfonts-extra
|
||||
- fonts-unfonts-core
|
||||
- ttf-indic-fonts
|
||||
- fonts-thai-tlwg
|
||||
- fonts-lklug-sinhala
|
||||
- python3-pip
|
||||
- git
|
||||
- libjpeg-turbo8-dev
|
||||
- zlib1g-dev
|
||||
- gcc
|
||||
- libpython3.4-dev
|
||||
# interpreter required by the virtualenv_python=python3.4 settings in this role
# (git itself is already listed earlier in this package list)
- python3.4
|
||||
- name: install Xvnc upstart config /etc/init/Xvnc.conf
|
||||
template: src=templates/Xvnc.conf.j2 dest=/etc/init/Xvnc.conf
|
||||
become: true
|
||||
notify:
|
||||
- restart Xvnc
|
||||
- name: install websockify in virtualenv
|
||||
pip: name=git+https://github.com/kanaka/websockify.git#egg=websockify
|
||||
virtualenv=/home/vagrant/websockify-ve34
|
||||
virtualenv_python=python3.4
|
||||
extra_args='--no-input --upgrade --pre'
|
||||
- name: install vnc-websock upstart config /etc/init/vnc-websock.conf
|
||||
template: src=templates/vnc-websock.conf.j2 dest=/etc/init/vnc-websock.conf
|
||||
become: true
|
||||
notify:
|
||||
- restart vnc-websock
|
||||
- name: install brozzler in virtualenv
|
||||
become: true
|
||||
pip: name=git+https://github.com/internetarchive/brozzler.git#egg=brozzler
|
||||
virtualenv=/home/vagrant/brozzler-ve34
|
||||
virtualenv_python=python3.4
|
||||
extra_args='--no-input --upgrade --pre'
|
||||
notify:
|
||||
- restart brozzler-worker
|
||||
- name: install brozzler-worker upstart config /etc/init/brozzler-worker.conf
|
||||
template: src=templates/brozzler-worker.conf.j2 dest=/etc/init/brozzler-worker.conf
|
||||
become: true
|
||||
notify:
|
||||
- restart brozzler-worker
|
14
vagrant/ansible/roles/brozzler-worker/templates/Xvnc.conf.j2
Normal file
14
vagrant/ansible/roles/brozzler-worker/templates/Xvnc.conf.j2
Normal file
@ -0,0 +1,14 @@
|
||||
description "Xvnc"
|
||||
|
||||
start on runlevel [2345]
|
||||
stop on runlevel [!2345]
|
||||
|
||||
setuid vagrant
|
||||
|
||||
console log
|
||||
|
||||
exec nice Xvnc4 :1 -auth /tmp/Xauthority.vagrant \
|
||||
-geometry 1600x1000 -depth 24 -rfbwait 0 -nolisten tcp -rfbport 5901 \
|
||||
-SecurityTypes None -pn -fp /usr/share/fonts/X11/misc/ -co /etc/X11/rgb \
|
||||
AcceptCutText=0 AcceptPointerEvents=0 AcceptKeyEvents=0
|
||||
|
@ -0,0 +1,25 @@
|
||||
description "brozzler-worker"
|
||||
|
||||
start on runlevel [2345]
|
||||
stop on runlevel [!2345]
|
||||
|
||||
env DISPLAY=:1
|
||||
env PATH=/home/vagrant/brozzler-ve34/bin:/usr/bin:/bin
|
||||
env PYTHONPATH=/home/vagrant/brozzler-ve34/lib/python3.4/site-packages
|
||||
env LANG=C.UTF-8
|
||||
|
||||
setuid vagrant
|
||||
|
||||
# console log
|
||||
|
||||
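# depends on vnc server
# NOTE(review): this second start on/stop on pair likely supersedes the
# runlevel-based pair earlier in this file (upstart is believed to keep only
# the last occurrence of a stanza) - confirm and drop whichever is unintended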
|
||||
start on started Xvnc
|
||||
stop on stopping Xvnc
|
||||
|
||||
kill timeout 60
|
||||
|
||||
exec nice brozzler-worker \
|
||||
--rethinkdb-servers=localhost \
|
||||
--max-browsers=4 >>/vagrant/logs/brozzler-worker.log 2>&1
|
||||
# --rethinkdb-servers={{groups['rethinkdb'] | join(',')}} \
|
||||
|
@ -0,0 +1,14 @@
|
||||
description "vnc-websock"
|
||||
|
||||
start on runlevel [2345]
|
||||
stop on runlevel [!2345]
|
||||
|
||||
setuid vagrant
|
||||
|
||||
console log
|
||||
|
||||
env PYTHONPATH=/home/vagrant/websockify-ve34/lib/python3.4/site-packages
|
||||
env PATH=/home/vagrant/websockify-ve34/bin:/usr/bin:/bin
|
||||
|
||||
exec nice websockify 0.0.0.0:8901 localhost:5901
|
||||
|
4
vagrant/ansible/roles/common/tasks/main.yml
Normal file
4
vagrant/ansible/roles/common/tasks/main.yml
Normal file
@ -0,0 +1,4 @@
|
||||
---
|
||||
- name: ensure logs directory exists
|
||||
file: path=/vagrant/logs state=directory
|
||||
become: true
|
4
vagrant/ansible/roles/rethinkdb/handlers/main.yml
Normal file
4
vagrant/ansible/roles/rethinkdb/handlers/main.yml
Normal file
@ -0,0 +1,4 @@
|
||||
---
|
||||
- name: restart rethinkdb
|
||||
service: name=rethinkdb state=restarted
|
||||
become: true
|
19
vagrant/ansible/roles/rethinkdb/tasks/main.yml
Normal file
19
vagrant/ansible/roles/rethinkdb/tasks/main.yml
Normal file
@ -0,0 +1,19 @@
|
||||
---
|
||||
- name: ensure rethinkdb apt public key is trusted
|
||||
# NOTE(review): the repository signing key is fetched over plain http, so it is
# not protected against tampering in transit; consider an https url once it is
# confirmed that the host serves the key over https
apt_key: url=http://download.rethinkdb.com/apt/pubkey.gpg
|
||||
become: true
|
||||
- name: ensure rethinkdb repo is in apt sources.list
|
||||
apt_repository: repo='deb http://download.rethinkdb.com/apt trusty main'
|
||||
state=present
|
||||
become: true
|
||||
- name: ensure rethinkdb package is installed
|
||||
apt: name=rethinkdb state=present
|
||||
become: true
|
||||
notify:
|
||||
- restart rethinkdb
|
||||
- name: ensure rethinkdb instance config file is installed
|
||||
template: src=templates/rethinkdb-brozzler-easy.conf.j2
|
||||
dest=/etc/rethinkdb/instances.d/rethinkdb-brozzler-easy.conf
|
||||
become: true
|
||||
notify:
|
||||
- restart rethinkdb
|
@ -0,0 +1,5 @@
|
||||
runuser=vagrant
|
||||
# bind=0.0.0.0
|
||||
# directory=/var/lib/rethinkdb
|
||||
# log-file=/var/log/rethinkdb.log
|
||||
log-file=/vagrant/logs/rethinkdb.log # synced dir
|
14
vagrant/ansible/roles/warcprox/handlers/main.yml
Normal file
14
vagrant/ansible/roles/warcprox/handlers/main.yml
Normal file
@ -0,0 +1,14 @@
|
||||
---
|
||||
# - name: start warcprox
|
||||
# environment:
|
||||
# PYTHONPATH: /home/vagrant/warcprox-ve34/lib/python3.4/site-packages
|
||||
# PATH: /home/vagrant/warcprox-ve34/bin:/usr/bin:/bin
|
||||
# args:
|
||||
# executable: /bin/bash
|
||||
# shell: nice warcprox --dir=/vagrant/warcs --base32 --gzip
|
||||
# --rollover-idle-time=180 --cacert=/vagrant/warcprox-ca.pem
|
||||
# --onion-tor-socks-proxy=localhost:9050 --rethinkdb-servers=localhost
|
||||
# --rethinkdb-big-table &> /vagrant/logs/warcprox.out &
|
||||
- name: restart warcprox
|
||||
service: name=warcprox state=restarted
|
||||
become: true
|
25
vagrant/ansible/roles/warcprox/tasks/main.yml
Normal file
25
vagrant/ansible/roles/warcprox/tasks/main.yml
Normal file
@ -0,0 +1,25 @@
|
||||
---
|
||||
- name: ensure required packages are installed
|
||||
become: true
|
||||
apt: name={{item}} state=present
|
||||
with_items:
|
||||
- gcc
|
||||
- python-virtualenv
|
||||
- python3.4
|
||||
- libpython3.4-dev
|
||||
- libffi-dev
|
||||
- libssl-dev
|
||||
- tor
|
||||
- git
|
||||
- name: install warcprox in virtualenv
|
||||
pip: name=git+https://github.com/internetarchive/warcprox.git@2.x#egg=warcprox
|
||||
virtualenv=/home/vagrant/warcprox-ve34
|
||||
virtualenv_python=python3.4
|
||||
extra_args='--no-input --upgrade --pre'
|
||||
notify:
|
||||
- restart warcprox
|
||||
- name: install upstart config /etc/init/warcprox.conf
|
||||
become: true
|
||||
template: src=templates/warcprox.conf.j2 dest=/etc/init/warcprox.conf
|
||||
notify:
|
||||
- restart warcprox
|
26
vagrant/ansible/roles/warcprox/templates/warcprox.conf.j2
Normal file
26
vagrant/ansible/roles/warcprox/templates/warcprox.conf.j2
Normal file
@ -0,0 +1,26 @@
|
||||
description "warcprox"
|
||||
|
||||
start on runlevel [2345]
|
||||
stop on runlevel [!2345]
|
||||
|
||||
env PYTHONPATH=/home/vagrant/warcprox-ve34/lib/python3.4/site-packages
|
||||
env PATH=/home/vagrant/warcprox-ve34/bin:/usr/bin:/bin
|
||||
|
||||
# by default warcprox creates some files/dirs relative to cwd
|
||||
chdir /home/vagrant
|
||||
setuid vagrant
|
||||
|
||||
# console log
|
||||
|
||||
# --profile
|
||||
exec nice warcprox \
|
||||
--dir=/vagrant/warcs \
|
||||
--base32 \
|
||||
--gzip \
|
||||
--rollover-idle-time=180 \
|
||||
--cacert=/vagrant/warcprox-ca.pem \
|
||||
--onion-tor-socks-proxy=localhost:9050 \
|
||||
--rethinkdb-servers=localhost \
|
||||
--rethinkdb-db=brozzler \
|
||||
--rethinkdb-big-table >>/vagrant/logs/warcprox.log 2>&1
|
||||
# --rethinkdb-servers={{groups['rethinkdb'] | join(',')}} \
|
Loading…
x
Reference in New Issue
Block a user