diff --git a/setup.py b/setup.py index 68e94b8..adf53b7 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ import setuptools setuptools.setup( name='brozzler', - version='1.1.dev40', + version='1.1.dev41', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', diff --git a/vagrant/README.rst b/vagrant/README.rst new file mode 100644 index 0000000..f546da8 --- /dev/null +++ b/vagrant/README.rst @@ -0,0 +1,61 @@ +Single-VM Vagrant Brozzler Deployment +------------------------------------- + +This is a work in progress. Vagrant + ansible configuration for a single-vm +deployment of brozzler and warcprox with dependencies (notably rethinkdb). + +The idea is for this to be a quick way for people to get up and running with a +deployment resembling a real distributed deployment, and to offer a starting +configuration for people to adapt to their clusters. + +And equally important, as a harness for integration tests. (As of now brozzler +itself has no automated tests!) + +You'll need vagrant installed. +https://www.vagrantup.com/docs/installation/ +Then run: + +:: + + my-laptop$ vagrant up + +Currently to start a crawl you first need to ssh to the vagrant vm and activate +the brozzler virtualenv. 
+ +:: + + my-laptop$ vagrant ssh + vagrant@brozzler-easy:~$ source ~/brozzler-ve34/bin/activate + (brozzler-ve34)vagrant@brozzler-easy:~$ + +Then you can run brozzler-new-site: + +:: + + (brozzler-ve34)vagrant@brozzler-easy:~$ brozzler-new-site \ + --proxy=localhost:8000 --enable-warcprox-features \ + http://example.com/ + + +Or brozzler-new-job (make sure to set the proxy to localhost:8000): + +:: + + (brozzler-ve34)vagrant@brozzler-easy:~$ cat >job1.yml + id: job1 + proxy: localhost:8000 # point at warcprox for archiving + enable_warcprox_features: true + seeds: + - url: https://example.org/ + (brozzler-ve34)vagrant@brozzler-easy:~$ brozzler-new-job job1.yml + +WARC files will appear in ./warcs and brozzler, warcprox and rethinkdb logs in +./logs (via Vagrant synced folders). + +You can also look at the rethinkdb console by opening http://localhost:8080 in +your browser after opening an ssh tunnel like so: + +:: + + my-laptop$ vagrant ssh -- -fN -Llocalhost:8080:localhost:8080 + diff --git a/vagrant/Vagrantfile b/vagrant/Vagrantfile new file mode 100644 index 0000000..c4eab85 --- /dev/null +++ b/vagrant/Vagrantfile @@ -0,0 +1,14 @@ +Vagrant.configure(2) do |config| + config.vm.box = "ubuntu/trusty64" + config.vm.hostname = "brozzler-easy" + + config.vm.provision "ansible" do |ansible| + ansible.playbook = "ansible/playbook.yml" + ansible.groups = { + "rethinkdb" => ["default"], + "warcprox" => ["default"], + "brozzler-worker" => ["default"], + # "brozzler-webconsole" => ["default"], + } + end +end diff --git a/vagrant/ansible/playbook.retry b/vagrant/ansible/playbook.retry new file mode 100644 index 0000000..4ad96d5 --- /dev/null +++ b/vagrant/ansible/playbook.retry @@ -0,0 +1 @@ +default diff --git a/vagrant/ansible/playbook.yml b/vagrant/ansible/playbook.yml new file mode 100644 index 0000000..f22f327 --- /dev/null +++ b/vagrant/ansible/playbook.yml @@ -0,0 +1,30 @@ +--- +- name: apply common configuration to all nodes + hosts: all + roles: + - common + +- 
name: deploy rethinkdb + hosts: rethinkdb + roles: + - rethinkdb + +- name: deploy warcprox + hosts: warcprox + roles: + - warcprox + +- name: deploy brozzler-worker + hosts: brozzler-worker + roles: + - brozzler-worker + +# - name: deploy brozzler-webconsole +# hosts: brozzler-webconsole +# roles: +# - brozzler-webconsole + +# - name: deploy pywb +# hosts: pywb +# roles: +# - pywb diff --git a/vagrant/ansible/roles/brozzler-webconsole/tasks/main.yml b/vagrant/ansible/roles/brozzler-webconsole/tasks/main.yml new file mode 100644 index 0000000..6b54696 --- /dev/null +++ b/vagrant/ansible/roles/brozzler-webconsole/tasks/main.yml @@ -0,0 +1,19 @@ +--- +- name: git clone https://github.com/internetarchive/brozzler.git + git: repo=https://github.com/internetarchive/brozzler.git + dest=/home/vagrant/brozzler +- name: pip install -r requirements.txt in virtualenv + pip: requirements=/home/vagrant/brozzler/webconsole/requirements.txt + virtualenv=/home/vagrant/brozzler-webconsole-ve34 + virtualenv_python=python3.4 + extra_args='--no-input --upgrade --pre' + notify: + - restart brozzler-webconsole +- name: install upstart config /etc/init/brozzler-webconsole.conf + become: true + template: src=templates/brozzler-webconsole.conf.j2 + dest=/etc/init/brozzler-webconsole.conf + notify: + - restart brozzler-webconsole + + diff --git a/vagrant/ansible/roles/brozzler-webconsole/templates/brozzler-webconsole.conf.j2 b/vagrant/ansible/roles/brozzler-webconsole/templates/brozzler-webconsole.conf.j2 new file mode 100644 index 0000000..efe2d03 --- /dev/null +++ b/vagrant/ansible/roles/brozzler-webconsole/templates/brozzler-webconsole.conf.j2 @@ -0,0 +1,21 @@ +description "brozzler-webconsole" + +start on runlevel [2345] +stop on runlevel [!2345] + +env PYTHONPATH=/home/vagrant/brozzler-webconsole-ve34/lib/python3.4/site-packages:/home/vagrant/brozzler/webconsole +env PATH=/home/vagrant/brozzler-webconsole-ve34/bin:/usr/bin:/bin +env LC_ALL=C.UTF-8 + +env 
WAYBACK_BASEURL={{base_wayback_url}}/all +# env RETHINKDB_SERVERS={{groups['rethinkdb'] | join(',')}} +env RETHINKDB_SERVERS=localhost +env RETHINKDB_DB={{rethinkdb_db}} + +setuid vagrant + +# console log + +exec gunicorn --bind=0.0.0.0:8081 brozzler-webconsole:app >>/vagrant/logs/brozzler-webconsole.log 2>&1 + + diff --git a/vagrant/ansible/roles/brozzler-worker/handlers/main.yml b/vagrant/ansible/roles/brozzler-worker/handlers/main.yml new file mode 100644 index 0000000..1fac304 --- /dev/null +++ b/vagrant/ansible/roles/brozzler-worker/handlers/main.yml @@ -0,0 +1,13 @@ +--- +- name: restart Xvnc + service: name=Xvnc state=restarted + become: true +- name: restart websockify + service: name=websockify state=restarted + become: true +- name: restart vnc-websock + service: name=vnc-websock state=restarted + become: true +- name: restart brozzler-worker + service: name=brozzler-worker state=restarted + become: true diff --git a/vagrant/ansible/roles/brozzler-worker/tasks/main.yml b/vagrant/ansible/roles/brozzler-worker/tasks/main.yml new file mode 100644 index 0000000..f2a4e23 --- /dev/null +++ b/vagrant/ansible/roles/brozzler-worker/tasks/main.yml @@ -0,0 +1,59 @@ +--- +- name: ensure required packages are installed + become: true + apt: name={{item}} state=present + with_items: + - python-virtualenv + - vnc4server + - chromium-browser + - xfonts-base + - fonts-arphic-bkai00mp + - fonts-arphic-bsmi00lp + - fonts-arphic-gbsn00lp + - fonts-arphic-gkai00mp + - fonts-arphic-ukai + - fonts-farsiweb + - fonts-nafees + - fonts-sil-abyssinica + - fonts-sil-ezra + - fonts-sil-padauk + - fonts-unfonts-extra + - fonts-unfonts-core + - ttf-indic-fonts + - fonts-thai-tlwg + - fonts-lklug-sinhala + - python3-pip + - git + - libjpeg-turbo8-dev + - zlib1g-dev + - gcc + - libpython3.4-dev + - git +- name: install Xvnc upstart config /etc/init/Xvnc.conf + template: src=templates/Xvnc.conf.j2 dest=/etc/init/Xvnc.conf + become: true + notify: + - restart Xvnc +- name: install websockify in 
virtualenv + pip: name=git+https://github.com/kanaka/websockify.git#egg=websockify + virtualenv=/home/vagrant/websockify-ve34 + virtualenv_python=python3.4 + extra_args='--no-input --upgrade --pre' +- name: install vnc-websock upstart config /etc/init/vnc-websock.conf + template: src=templates/vnc-websock.conf.j2 dest=/etc/init/vnc-websock.conf + become: true + notify: + - restart vnc-websock +- name: install brozzler in virtualenv + become: true + pip: name=git+https://github.com/internetarchive/brozzler.git#egg=brozzler + virtualenv=/home/vagrant/brozzler-ve34 + virtualenv_python=python3.4 + extra_args='--no-input --upgrade --pre' + notify: + - restart brozzler-worker +- name: install brozzler-worker upstart config /etc/init/brozzler-worker.conf + template: src=templates/brozzler-worker.conf.j2 dest=/etc/init/brozzler-worker.conf + become: true + notify: + - restart brozzler-worker diff --git a/vagrant/ansible/roles/brozzler-worker/templates/Xvnc.conf.j2 b/vagrant/ansible/roles/brozzler-worker/templates/Xvnc.conf.j2 new file mode 100644 index 0000000..6381143 --- /dev/null +++ b/vagrant/ansible/roles/brozzler-worker/templates/Xvnc.conf.j2 @@ -0,0 +1,14 @@ +description "Xvnc" + +start on runlevel [2345] +stop on runlevel [!2345] + +setuid vagrant + +console log + +exec nice Xvnc4 :1 -auth /tmp/Xauthority.vagrant \ + -geometry 1600x1000 -depth 24 -rfbwait 0 -nolisten tcp -rfbport 5901 \ + -SecurityTypes None -pn -fp /usr/share/fonts/X11/misc/ -co /etc/X11/rgb \ + AcceptCutText=0 AcceptPointerEvents=0 AcceptKeyEvents=0 + diff --git a/vagrant/ansible/roles/brozzler-worker/templates/brozzler-worker.conf.j2 b/vagrant/ansible/roles/brozzler-worker/templates/brozzler-worker.conf.j2 new file mode 100644 index 0000000..4ec328a --- /dev/null +++ b/vagrant/ansible/roles/brozzler-worker/templates/brozzler-worker.conf.j2 @@ -0,0 +1,25 @@ +description "brozzler-worker" + +start on runlevel [2345] +stop on runlevel [!2345] + +env DISPLAY=:1 +env 
PATH=/home/vagrant/brozzler-ve34/bin:/usr/bin:/bin +env PYTHONPATH=/home/vagrant/brozzler-ve34/lib/python3.4/site-packages +env LANG=C.UTF-8 + +setuid vagrant + +# console log + +# depends on vnc server +start on started Xvnc +stop on stopping Xvnc + +kill timeout 60 + +exec nice brozzler-worker \ + --rethinkdb-servers=localhost \ + --max-browsers=4 >>/vagrant/logs/brozzler-worker.log 2>&1 + # --rethinkdb-servers={{groups['rethinkdb'] | join(',')}} \ + diff --git a/vagrant/ansible/roles/brozzler-worker/templates/vnc-websock.conf.j2 b/vagrant/ansible/roles/brozzler-worker/templates/vnc-websock.conf.j2 new file mode 100644 index 0000000..86b4012 --- /dev/null +++ b/vagrant/ansible/roles/brozzler-worker/templates/vnc-websock.conf.j2 @@ -0,0 +1,14 @@ +description "vnc-websock" + +start on runlevel [2345] +stop on runlevel [!2345] + +setuid vagrant + +console log + +env PYTHONPATH=/home/vagrant/websockify-ve34/lib/python3.4/site-packages +env PATH=/home/vagrant/websockify-ve34/bin:/usr/bin:/bin + +exec nice websockify 0.0.0.0:8901 localhost:5901 + diff --git a/vagrant/ansible/roles/common/tasks/main.yml b/vagrant/ansible/roles/common/tasks/main.yml new file mode 100644 index 0000000..f9012ca --- /dev/null +++ b/vagrant/ansible/roles/common/tasks/main.yml @@ -0,0 +1,4 @@ +--- +- name: ensure logs directory exists + file: path=/vagrant/logs state=directory + become: true diff --git a/vagrant/ansible/roles/rethinkdb/handlers/main.yml b/vagrant/ansible/roles/rethinkdb/handlers/main.yml new file mode 100644 index 0000000..512fae0 --- /dev/null +++ b/vagrant/ansible/roles/rethinkdb/handlers/main.yml @@ -0,0 +1,4 @@ +--- +- name: restart rethinkdb + service: name=rethinkdb state=restarted + become: true diff --git a/vagrant/ansible/roles/rethinkdb/tasks/main.yml b/vagrant/ansible/roles/rethinkdb/tasks/main.yml new file mode 100644 index 0000000..7083a14 --- /dev/null +++ b/vagrant/ansible/roles/rethinkdb/tasks/main.yml @@ -0,0 +1,19 @@ +--- +- name: ensure rethinkdb apt public 
key is trusted + apt_key: url=http://download.rethinkdb.com/apt/pubkey.gpg + become: true +- name: ensure rethinkdb repo is in apt sources.list + apt_repository: repo='deb http://download.rethinkdb.com/apt trusty main' + state=present + become: true +- name: ensure rethinkdb package is installed + apt: name=rethinkdb state=present + become: true + notify: + - restart rethinkdb +- name: ensure rethinkdb instance config file is installed + template: src=templates/rethinkdb-brozzler-easy.conf.j2 + dest=/etc/rethinkdb/instances.d/rethinkdb-brozzler-easy.conf + become: true + notify: + - restart rethinkdb diff --git a/vagrant/ansible/roles/rethinkdb/templates/rethinkdb-brozzler-easy.conf.j2 b/vagrant/ansible/roles/rethinkdb/templates/rethinkdb-brozzler-easy.conf.j2 new file mode 100644 index 0000000..bbb1099 --- /dev/null +++ b/vagrant/ansible/roles/rethinkdb/templates/rethinkdb-brozzler-easy.conf.j2 @@ -0,0 +1,5 @@ +runuser=vagrant +# bind=0.0.0.0 +# directory=/var/lib/rethinkdb +# log-file=/var/log/rethinkdb.log +log-file=/vagrant/logs/rethinkdb.log # synced dir diff --git a/vagrant/ansible/roles/warcprox/handlers/main.yml b/vagrant/ansible/roles/warcprox/handlers/main.yml new file mode 100644 index 0000000..698d871 --- /dev/null +++ b/vagrant/ansible/roles/warcprox/handlers/main.yml @@ -0,0 +1,14 @@ +--- +# - name: start warcprox +# environment: +# PYTHONPATH: /home/vagrant/warcprox-ve34/lib/python3.4/site-packages +# PATH: /home/vagrant/warcprox-ve34/bin:/usr/bin:/bin +# args: +# executable: /bin/bash +# shell: nice warcprox --dir=/vagrant/warcs --base32 --gzip +# --rollover-idle-time=180 --cacert=/vagrant/warcprox-ca.pem +# --onion-tor-socks-proxy=localhost:9050 --rethinkdb-servers=localhost +# --rethinkdb-big-table &> /vagrant/logs/warcprox.out & +- name: restart warcprox + service: name=warcprox state=restarted + become: true diff --git a/vagrant/ansible/roles/warcprox/tasks/main.yml b/vagrant/ansible/roles/warcprox/tasks/main.yml new file mode 100644 index 
0000000..c9f611d --- /dev/null +++ b/vagrant/ansible/roles/warcprox/tasks/main.yml @@ -0,0 +1,25 @@ +--- +- name: ensure required packages are installed + become: true + apt: name={{item}} state=present + with_items: + - gcc + - python-virtualenv + - python3.4 + - libpython3.4-dev + - libffi-dev + - libssl-dev + - tor + - git +- name: install warcprox in virtualenv + pip: name=git+https://github.com/internetarchive/warcprox.git@2.x#egg=warcprox + virtualenv=/home/vagrant/warcprox-ve34 + virtualenv_python=python3.4 + extra_args='--no-input --upgrade --pre' + notify: + - restart warcprox +- name: install upstart config /etc/init/warcprox.conf + become: true + template: src=templates/warcprox.conf.j2 dest=/etc/init/warcprox.conf + notify: + - restart warcprox diff --git a/vagrant/ansible/roles/warcprox/templates/warcprox.conf.j2 b/vagrant/ansible/roles/warcprox/templates/warcprox.conf.j2 new file mode 100644 index 0000000..1afccce --- /dev/null +++ b/vagrant/ansible/roles/warcprox/templates/warcprox.conf.j2 @@ -0,0 +1,26 @@ +description "warcprox" + +start on runlevel [2345] +stop on runlevel [!2345] + +env PYTHONPATH=/home/vagrant/warcprox-ve34/lib/python3.4/site-packages +env PATH=/home/vagrant/warcprox-ve34/bin:/usr/bin:/bin + +# by default warcprox creates some files/dirs relative to cwd +chdir /home/vagrant +setuid vagrant + +# console log + +# --profile +exec nice warcprox \ + --dir=/vagrant/warcs \ + --base32 \ + --gzip \ + --rollover-idle-time=180 \ + --cacert=/vagrant/warcprox-ca.pem \ + --onion-tor-socks-proxy=localhost:9050 \ + --rethinkdb-servers=localhost \ + --rethinkdb-db=brozzler \ + --rethinkdb-big-table >>/vagrant/logs/warcprox.log 2>&1 + # --rethinkdb-servers={{groups['rethinkdb'] | join(',')}} \