vagrant setup (unfinished)

This commit is contained in:
Noah Levitt 2016-06-30 17:50:11 -05:00
parent 79ad57669c
commit 2aef00826b
19 changed files with 369 additions and 1 deletions

View File

@ -21,7 +21,7 @@ import setuptools
setuptools.setup(
name='brozzler',
version='1.1.dev40',
version='1.1.dev41',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',

61
vagrant/README.rst Normal file
View File

@ -0,0 +1,61 @@
Single-VM Vagrant Brozzler Deployment
-------------------------------------
This is a work in progress: a Vagrant + ansible configuration for a single-VM
deployment of brozzler and warcprox with their dependencies (notably rethinkdb).
The idea is for this to be a quick way for people to get up and running with a
deployment resembling a real distributed deployment, and to offer a starting
configuration for people to adapt to their clusters.
Equally important, it is meant to serve as a harness for integration tests.
(As of now brozzler itself has no automated tests!)
You'll need vagrant installed.
https://www.vagrantup.com/docs/installation/
Then run:
::
my-laptop$ vagrant up
Currently to start a crawl you first need to ssh to the vagrant vm and activate
the brozzler virtualenv.
::
my-laptop$ vagrant ssh
vagrant@brozzler-easy:~$ source ~/brozzler-ve34/bin/activate
(brozzler-ve34)vagrant@brozzler-easy:~$
Then you can run brozzler-new-site:
::
(brozzler-ve34)vagrant@brozzler-easy:~$ brozzler-new-site \
--proxy=localhost:8000 --enable-warcprox-features \
http://example.com/
Or brozzler-new-job (make sure to set the proxy to localhost:8000):
::
(brozzler-ve34)vagrant@brozzler-easy:~$ cat >job1.yml
id: job1
proxy: localhost:8000 # point at warcprox for archiving
enable_warcprox_features: true
seeds:
- url: https://example.org/
(brozzler-ve34)vagrant@brozzler-easy:~$ brozzler-new-job job1.yml
WARC files will appear in ./warcs, and the brozzler, warcprox and rethinkdb
logs in ./logs (via vagrant synced folders).
You can also look at the rethinkdb console by opening http://localhost:8080 in
your browser after opening an ssh tunnel like so:
::
my-laptop$ vagrant ssh -- -fN -Llocalhost:8080:localhost:8080

14
vagrant/Vagrantfile vendored Normal file
View File

@ -0,0 +1,14 @@
# Single-VM "brozzler-easy" machine, provisioned with ansible.
Vagrant.configure(2) do |config|
  config.vm.box = "ubuntu/trusty64"
  config.vm.hostname = "brozzler-easy"

  config.vm.provision "ansible" do |ansible|
    ansible.playbook = "ansible/playbook.yml"

    # Every enabled inventory group contains just the one machine, "default".
    # "brozzler-webconsole" is not enabled yet; add it to this list to map it
    # to the same machine.
    enabled_groups = %w[rethinkdb warcprox brozzler-worker]
    ansible.groups = enabled_groups.each_with_object({}) do |group, groups|
      groups[group] = ["default"]
    end
  end
end

View File

@ -0,0 +1 @@
default

View File

@ -0,0 +1,30 @@
---
# One play per component: the "common" role goes to every host, and each
# remaining role is applied to the inventory group of the same name.

- name: apply common configuration to all nodes
  hosts: all
  roles:
    - role: common

- name: deploy rethinkdb
  hosts: rethinkdb
  roles:
    - role: rethinkdb

- name: deploy warcprox
  hosts: warcprox
  roles:
    - role: warcprox

- name: deploy brozzler-worker
  hosts: brozzler-worker
  roles:
    - role: brozzler-worker

# Not enabled yet:
#
# - name: deploy brozzler-webconsole
#   hosts: brozzler-webconsole
#   roles:
#     - role: brozzler-webconsole
#
# - name: deploy pywb
#   hosts: pywb
#   roles:
#     - role: pywb

View File

@ -0,0 +1,19 @@
---
# Tasks for the brozzler-webconsole role: check out brozzler, install the web
# console's python requirements into their own virtualenv, and install the
# upstart job. Changes to the requirements or the job file restart the service.

- name: git clone https://github.com/internetarchive/brozzler.git
  git:
    repo: https://github.com/internetarchive/brozzler.git
    dest: /home/vagrant/brozzler

- name: pip install -r requirements.txt in virtualenv
  pip:
    requirements: /home/vagrant/brozzler/webconsole/requirements.txt
    virtualenv: /home/vagrant/brozzler-webconsole-ve34
    virtualenv_python: python3.4
    extra_args: '--no-input --upgrade --pre'
  notify:
    - restart brozzler-webconsole

- name: install upstart config /etc/init/brozzler-webconsole.conf
  become: true
  template:
    src: templates/brozzler-webconsole.conf.j2
    dest: /etc/init/brozzler-webconsole.conf
  notify:
    - restart brozzler-webconsole

View File

@ -0,0 +1,21 @@
# Upstart job for the brozzler web console, served by gunicorn on port 8081.
# This file is an ansible template: the double-brace placeholders are filled
# in from ansible variables when the job file is installed.
description "brozzler-webconsole"

start on runlevel [2345]
stop on runlevel [!2345]

# Run out of the web console virtualenv and the brozzler checkout.
env PYTHONPATH=/home/vagrant/brozzler-webconsole-ve34/lib/python3.4/site-packages:/home/vagrant/brozzler/webconsole
env PATH=/home/vagrant/brozzler-webconsole-ve34/bin:/usr/bin:/bin
env LC_ALL=C.UTF-8
env WAYBACK_BASEURL={{base_wayback_url}}/all
# env RETHINKDB_SERVERS={{groups['rethinkdb'] | join(',')}}
env RETHINKDB_SERVERS=localhost
env RETHINKDB_DB={{rethinkdb_db}}

setuid vagrant

# console log
# Upstart hands a command containing shell metacharacters to /bin/sh, which on
# Ubuntu is not bash and rejects the bash-only ">&file" redirection. Use the
# portable form instead, appending to the log like the brozzler-worker and
# warcprox jobs do.
exec gunicorn --bind=0.0.0.0:8081 brozzler-webconsole:app >>/vagrant/logs/brozzler-webconsole.log 2>&1

View File

@ -0,0 +1,13 @@
---
# Handlers for the brozzler-worker role. Each one restarts a single service
# with elevated privileges when a task notifies it.

- name: restart Xvnc
  become: true
  service:
    name: Xvnc
    state: restarted

# NOTE(review): the upstart job installed for websockify appears to be named
# "vnc-websock"; confirm whether a separate "websockify" service exists and
# whether anything still notifies this handler.
- name: restart websockify
  become: true
  service:
    name: websockify
    state: restarted

- name: restart vnc-websock
  become: true
  service:
    name: vnc-websock
    state: restarted

- name: restart brozzler-worker
  become: true
  service:
    name: brozzler-worker
    state: restarted

View File

@ -0,0 +1,59 @@
---
# Tasks for the brozzler-worker role: install system packages (browser, VNC
# server, fonts, build dependencies), then set up the three upstart services
# the worker needs: Xvnc, vnc-websock (websockify) and brozzler-worker.

- name: ensure required packages are installed
  become: true
  apt:
    name: "{{ item }}"
    state: present
  with_items:
    - python-virtualenv
    - vnc4server
    - chromium-browser
    - xfonts-base
    - fonts-arphic-bkai00mp
    - fonts-arphic-bsmi00lp
    - fonts-arphic-gbsn00lp
    - fonts-arphic-gkai00mp
    - fonts-arphic-ukai
    - fonts-farsiweb
    - fonts-nafees
    - fonts-sil-abyssinica
    - fonts-sil-ezra
    - fonts-sil-padauk
    - fonts-unfonts-extra
    - fonts-unfonts-core
    - ttf-indic-fonts
    - fonts-thai-tlwg
    - fonts-lklug-sinhala
    - python3-pip
    - git
    - libjpeg-turbo8-dev
    - zlib1g-dev
    - gcc
    - libpython3.4-dev

- name: install Xvnc upstart config /etc/init/Xvnc.conf
  become: true
  template:
    src: templates/Xvnc.conf.j2
    dest: /etc/init/Xvnc.conf
  notify:
    - restart Xvnc

# Restart the websocket proxy when its code changes, the same way the
# brozzler install below restarts brozzler-worker.
- name: install websockify in virtualenv
  pip:
    name: 'git+https://github.com/kanaka/websockify.git#egg=websockify'
    virtualenv: /home/vagrant/websockify-ve34
    virtualenv_python: python3.4
    extra_args: '--no-input --upgrade --pre'
  notify:
    - restart vnc-websock

- name: install vnc-websock upstart config /etc/init/vnc-websock.conf
  become: true
  template:
    src: templates/vnc-websock.conf.j2
    dest: /etc/init/vnc-websock.conf
  notify:
    - restart vnc-websock

# NOTE(review): this is the only pip install here that runs with become, so
# the virtualenv under /home/vagrant is presumably created by root — confirm
# that this is intended.
- name: install brozzler in virtualenv
  become: true
  pip:
    name: 'git+https://github.com/internetarchive/brozzler.git#egg=brozzler'
    virtualenv: /home/vagrant/brozzler-ve34
    virtualenv_python: python3.4
    extra_args: '--no-input --upgrade --pre'
  notify:
    - restart brozzler-worker

- name: install brozzler-worker upstart config /etc/init/brozzler-worker.conf
  become: true
  template:
    src: templates/brozzler-worker.conf.j2
    dest: /etc/init/brozzler-worker.conf
  notify:
    - restart brozzler-worker

View File

@ -0,0 +1,14 @@
# Upstart job that runs the Xvnc4 X/VNC server on display :1 as user vagrant.
description "Xvnc"
start on runlevel [2345]
stop on runlevel [!2345]
setuid vagrant
console log
# VNC is served on port 5901 (-rfbport) with no authentication
# (-SecurityTypes None). The Accept*=0 settings presumably make VNC sessions
# view-only — confirm against the Xvnc4 man page.
exec nice Xvnc4 :1 -auth /tmp/Xauthority.vagrant \
-geometry 1600x1000 -depth 24 -rfbwait 0 -nolisten tcp -rfbport 5901 \
-SecurityTypes None -pn -fp /usr/share/fonts/X11/misc/ -co /etc/X11/rgb \
AcceptCutText=0 AcceptPointerEvents=0 AcceptKeyEvents=0

View File

@ -0,0 +1,25 @@
# Upstart job that runs brozzler-worker out of its virtualenv as user vagrant.
description "brozzler-worker"

# depends on vnc server
# The worker uses the X display provided by the Xvnc job (DISPLAY below), so
# it starts and stops with that job. A job keeps only one "start on" and one
# "stop on" condition, so these are the only ones declared; the Xvnc job is
# itself tied to the runlevels.
start on started Xvnc
stop on stopping Xvnc

env DISPLAY=:1
env PATH=/home/vagrant/brozzler-ve34/bin:/usr/bin:/bin
env PYTHONPATH=/home/vagrant/brozzler-ve34/lib/python3.4/site-packages
env LANG=C.UTF-8

setuid vagrant

# console log

# Wait up to 60 seconds after the stop signal before the job is killed.
kill timeout 60

# Output is appended to the log in the /vagrant/logs directory.
exec nice brozzler-worker \
    --rethinkdb-servers=localhost \
    --max-browsers=4 >>/vagrant/logs/brozzler-worker.log 2>&1
# --rethinkdb-servers={{groups['rethinkdb'] | join(',')}} \

View File

@ -0,0 +1,14 @@
# Upstart job that runs websockify from its virtualenv as user vagrant.
description "vnc-websock"
start on runlevel [2345]
stop on runlevel [!2345]
setuid vagrant
console log
# Use the websockify virtualenv's packages and executables.
env PYTHONPATH=/home/vagrant/websockify-ve34/lib/python3.4/site-packages
env PATH=/home/vagrant/websockify-ve34/bin:/usr/bin:/bin
# Listen on port 8901 on all interfaces, with localhost:5901 as the target
# (5901 is the port the Xvnc job is configured to serve VNC on).
exec nice websockify 0.0.0.0:8901 localhost:5901

View File

@ -0,0 +1,4 @@
---
# Tasks for the common role. The services deployed by the other roles write
# their logs under /vagrant/logs, so make sure that directory is there.
- name: ensure logs directory exists
  become: true
  file:
    path: /vagrant/logs
    state: directory

View File

@ -0,0 +1,4 @@
---
# Handlers for the rethinkdb role.
- name: restart rethinkdb
  become: true
  service:
    name: rethinkdb
    state: restarted

View File

@ -0,0 +1,19 @@
---
# Tasks for the rethinkdb role: trust the rethinkdb apt repository, install
# the package, and install the instance configuration. Installing the package
# or changing the configuration restarts the service.

# NOTE(review): the signing key is fetched over plain HTTP — consider whether
# an HTTPS URL can be used here.
- name: ensure rethinkdb apt public key is trusted
  become: true
  apt_key:
    url: http://download.rethinkdb.com/apt/pubkey.gpg

- name: ensure rethinkdb repo is in apt sources.list
  become: true
  apt_repository:
    repo: 'deb http://download.rethinkdb.com/apt trusty main'
    state: present

- name: ensure rethinkdb package is installed
  become: true
  apt:
    name: rethinkdb
    state: present
  notify:
    - restart rethinkdb

- name: ensure rethinkdb instance config file is installed
  become: true
  template:
    src: templates/rethinkdb-brozzler-easy.conf.j2
    dest: /etc/rethinkdb/instances.d/rethinkdb-brozzler-easy.conf
  notify:
    - restart rethinkdb

View File

@ -0,0 +1,5 @@
# rethinkdb instance configuration for the single-VM deployment.
# The commented-out options below are not set by this file.
runuser=vagrant
# bind=0.0.0.0
# directory=/var/lib/rethinkdb
# log-file=/var/log/rethinkdb.log
# NOTE(review): the next line relies on rethinkdb's config parser accepting a
# trailing comment after a value — confirm, or move the comment to its own
# line.
log-file=/vagrant/logs/rethinkdb.log # synced dir

View File

@ -0,0 +1,14 @@
---
# Handlers for the warcprox role.

# Earlier approach, kept for reference: launch warcprox directly from a shell
# task instead of going through the service.
#
# - name: start warcprox
#   environment:
#     PYTHONPATH: /home/vagrant/warcprox-ve34/lib/python3.4/site-packages
#     PATH: /home/vagrant/warcprox-ve34/bin:/usr/bin:/bin
#   args:
#     executable: /bin/bash
#   shell: nice warcprox --dir=/vagrant/warcs --base32 --gzip
#     --rollover-idle-time=180 --cacert=/vagrant/warcprox-ca.pem
#     --onion-tor-socks-proxy=localhost:9050 --rethinkdb-servers=localhost
#     --rethinkdb-big-table &> /vagrant/logs/warcprox.out &

- name: restart warcprox
  become: true
  service:
    name: warcprox
    state: restarted

View File

@ -0,0 +1,25 @@
---
# Tasks for the warcprox role: install system packages, install warcprox
# (2.x branch) into its own virtualenv, and install the upstart job. A new
# warcprox install or a changed job file restarts the service.

- name: ensure required packages are installed
  become: true
  apt:
    name: "{{ item }}"
    state: present
  with_items:
    - gcc
    - python-virtualenv
    - python3.4
    - libpython3.4-dev
    - libffi-dev
    - libssl-dev
    - tor
    - git

- name: install warcprox in virtualenv
  pip:
    name: 'git+https://github.com/internetarchive/warcprox.git@2.x#egg=warcprox'
    virtualenv: /home/vagrant/warcprox-ve34
    virtualenv_python: python3.4
    extra_args: '--no-input --upgrade --pre'
  notify:
    - restart warcprox

- name: install upstart config /etc/init/warcprox.conf
  become: true
  template:
    src: templates/warcprox.conf.j2
    dest: /etc/init/warcprox.conf
  notify:
    - restart warcprox

View File

@ -0,0 +1,26 @@
# Upstart job that runs warcprox out of its virtualenv as user vagrant.
description "warcprox"
start on runlevel [2345]
stop on runlevel [!2345]
# Use the warcprox virtualenv's packages and executables.
env PYTHONPATH=/home/vagrant/warcprox-ve34/lib/python3.4/site-packages
env PATH=/home/vagrant/warcprox-ve34/bin:/usr/bin:/bin
# by default warcprox creates some files/dirs relative to cwd
chdir /home/vagrant
setuid vagrant
# console log
# --profile
# WARCs and the CA certificate live under /vagrant; stdout and stderr are
# appended to /vagrant/logs/warcprox.log.
exec nice warcprox \
--dir=/vagrant/warcs \
--base32 \
--gzip \
--rollover-idle-time=180 \
--cacert=/vagrant/warcprox-ca.pem \
--onion-tor-socks-proxy=localhost:9050 \
--rethinkdb-servers=localhost \
--rethinkdb-db=brozzler \
--rethinkdb-big-table >>/vagrant/logs/warcprox.log 2>&1
# Alternative to the hard-coded localhost above, built from the inventory:
# --rethinkdb-servers={{groups['rethinkdb'] | join(',')}} \