Merge branch 'typos' into qa

commit 4ada3e01b7
Barbara Miller, 2019-05-17 17:24:19 -07:00
40 changed files with 469 additions and 320 deletions

View file

@@ -11,19 +11,22 @@ before_install:
 - sudo pip install ansible==2.1.3.0
 install:
 - ansible-playbook --extra-vars="brozzler_pip_name=file://$TRAVIS_BUILD_DIR#egg=brozzler user=travis" --inventory-file=ansible/hosts-localhost ansible/playbook.yml
-- pip install $TRAVIS_BUILD_DIR git+https://github.com/internetarchive/warcprox.git#egg=warcprox pytest
+- pip install $TRAVIS_BUILD_DIR git+https://github.com/internetarchive/warcprox.git#egg=warcprox pytest==4.3.0
 - chromium-browser --version
 - sudo apt-get update
 - sudo apt-get install --only-upgrade chromium-browser
 - chromium-browser --version
-- sudo service brozzler-worker restart
+- ps ww -fHe
+- sudo cat /var/log/Xvnc.log
+- sudo cat /var/log/brozzler-worker.log
+- sudo cat /var/log/warcprox.log
 script:
 - DISPLAY=:1 py.test --tb=native -v tests
 after_failure:
 - chromium-browser --version
-- sudo cat /var/log/upstart/warcprox.log
-- sudo cat /var/log/upstart/brozzler-worker.log
-- sudo cat /var/log/upstart/pywb.log
+- sudo cat /var/log/warcprox.log
+- sudo cat /var/log/brozzler-worker.log
+- sudo cat /var/log/pywb.log
 notifications:
   slack:
     secure: KPPXSscXnmSEQ2NXBZFKrzDEYHg067Kv1WR7RTRUH8EIlSS9MHTyErRa7HkaRPmqOllj4vvPbplNU2ALnCfhP4cqW+MvF0xv3GuEGXQ7Om2sBvVUQ3w0JJ5rLq9ferAfGdSnQFeViqfDix5LA3fMNZGouUHQdUHq7iO8E9n9jntvkKO9Jff7Dyo0K5KvOZOJfM9KsqFZLlFO5zoNB6Y9jubIT7+Ulk3EDto/Kny34VPIyJIm7y0cHHlYLEq780AweY0EIwMyMg/VPSRrVAsbLSrilO0YRgsQpjPC9Ci/rAWNWooaOk0eA+bwv1uHQnGtH0z446XUMXr3UZ2QlD4DE/uoP2okkl8EtqvlmEyjV8eO86TqYFDRgKfYpvlK6hHtb7SAHX28QeXQjbKNc5f7KpKO5PtZqaoBRL7acLlKyS8xQGiRtonTPFSBTFR2A+s6dZmKO9dDboglptiHk4dvL1ZD4S8qLJn1JjTJqvIU6tpCY3BpNErn4n1MkDjN5nqdXf7Q9Vmui8vRetwnMf1oXcsKj9FEt2utNfDqFNXcFsN+Mnr9rhXQ1++gt/7Zo844OowiARcxqZTNy5LqSD01WgGCvNMy3Odf+FTQ8PcDOF+001+g8La1R99U0o9/hT/gy+WYk2prYneWru4pQHF/a6goZgkLTwkskcaPVpDJtDs=

View file

@@ -1,7 +1,9 @@
 [all:vars]
 warcs_dir=/vagrant/warcs
-brozzler_pip_name='-e /brozzler'
+# brozzler_pip_name='-e /brozzler' # not working anymore? :(
+brozzler_pip_name='/brozzler'
 user=vagrant
+ansible_python_interpreter=/usr/bin/python3
 ### possible values for a prod deployment
 # brozzler_pip_name=brozzler # get it from pypi
 # brozzler_pip_name=git+https://github.com/internetarchive/brozzler.git#egg=brozzler

View file

@@ -1,4 +1,8 @@
 ---
 - name: restart brozzler-dashboard
-  service: name=brozzler-dashboard state=restarted
+  svc:
+    name: brozzler-dashboard
+    state: restarted
+    service_dir: /etc/service
   become: true
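This is the pattern every service in this commit follows: the upstart-era service: one-liner becomes the daemontools svc module pointed at a run directory under /etc/service. Outside of Ansible, the resulting services can be exercised with daemontools' command-line tools, which is what the updated test fixtures later in this commit do; a minimal sketch along those lines (assuming daemontools is installed and the service directory exists):

import subprocess

def restart_service(service):
    # 'svc -d' takes a daemontools-supervised service down, 'svc -u' brings it up;
    # 'svstat' reports whether the supervised process is currently up or down
    subprocess.check_call(['sudo', 'svc', '-d', '/etc/service/' + service])
    subprocess.check_call(['sudo', 'svc', '-u', '/etc/service/' + service])
    print(subprocess.check_output(['sudo', 'svstat', '/etc/service/' + service]))

restart_service('brozzler-dashboard')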

View file

@@ -1,20 +1,33 @@
 ---
-- name: mkdir {{venv_root}}/brozzler-dashboard-ve34
-  file: path={{venv_root}}/brozzler-dashboard-ve34 state=directory
+- name: mkdir {{venv_root}}/brozzler-dashboard-ve3
+  file: path={{venv_root}}/brozzler-dashboard-ve3 state=directory
         owner={{user}}
   become: true
 - name: install brozzler[dashboard] in virtualenv
-  pip: name='{{brozzler_pip_name}}[dashboard]'
-       virtualenv={{venv_root}}/brozzler-dashboard-ve34
-       virtualenv_python=python3.4
-       extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
+  pip:
+    name: '{{brozzler_pip_name}}[dashboard]'
+    virtualenv: '{{venv_root}}/brozzler-dashboard-ve3'
+    virtualenv_python: python3
+    virtualenv_command: python3 /usr/lib/python3/dist-packages/virtualenv.py
+    extra_args: '--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
   become: true
   become_user: '{{user}}'
   notify:
     - restart brozzler-dashboard
-- name: install upstart config /etc/init/brozzler-dashboard.conf
+- name: mkdir /etc/service/brozzler-dashboard
+  file:
+    path: /etc/service/brozzler-dashboard
+    state: directory
   become: true
-  template: src=templates/brozzler-dashboard.conf.j2
-            dest=/etc/init/brozzler-dashboard.conf
+- name: install /etc/service/brozzler-dashboard/run
+  template:
+    src: templates/brozzler-dashboard-run.j2
+    dest: /etc/service/brozzler-dashboard/run
+    mode: 0755
   notify:
     - restart brozzler-dashboard
+  become: true

View file

@@ -0,0 +1,15 @@
#!/bin/bash
logfile=/var/log/brozzler-dashboard.log
touch $logfile
chown {{user}} $logfile
source /opt/brozzler-dashboard-ve3/bin/activate
exec nice setuidgid {{user}} \
env WAYBACK_BASEURL=http://{{groups['pywb'][0]}}:8880/brozzler \
RETHINKDB_SERVERS={{groups['rethinkdb'] | join(',')}} \
RETHINKDB_DB=brozzler LANG=en_US.UTF-8 LC_COLLATE=C \
gunicorn --bind=0.0.0.0:8881 brozzler.dashboard:app \
>> $logfile 2>&1

View file

@@ -1,18 +0,0 @@
description "brozzler-dashboard"
start on runlevel [2345]
stop on runlevel [!2345]
env PYTHONPATH={{venv_root}}/brozzler-dashboard-ve34/lib/python3.4/site-packages
env PATH={{venv_root}}/brozzler-dashboard-ve34/bin:/usr/bin:/bin
env LC_ALL=C.UTF-8
env WAYBACK_BASEURL=http://{{groups['pywb'][0]}}:8880/brozzler
env RETHINKDB_SERVERS={{groups['rethinkdb'] | join(',')}}
env RETHINKDB_DB=brozzler
setuid {{user}}
console log
exec gunicorn --bind=0.0.0.0:8881 brozzler.dashboard:app

View file

@@ -1,13 +1,22 @@
 ---
 - name: restart Xvnc
-  service: name=Xvnc state=restarted
-  become: true
-- name: restart websockify
-  service: name=websockify state=restarted
+  svc:
+    name: Xvnc
+    state: restarted
+    service_dir: /etc/service
   become: true
 - name: restart vnc-websock
-  service: name=vnc-websock state=restarted
+  svc:
+    name: vnc-websock
+    state: restarted
+    service_dir: /etc/service
   become: true
 - name: restart brozzler-worker
-  service: name=brozzler-worker state=restarted
+  svc:
+    name: brozzler-worker
+    state: restarted
+    service_dir: /etc/service
   become: true

View file

@@ -3,14 +3,22 @@
   apt_repository: repo='deb http://archive.canonical.com/ubuntu trusty partner'
                   state=present
   become: true
 - apt: update_cache=yes
   become: true
 - name: ensure required packages are installed
   become: true
   apt: name={{item}} state=present
   with_items:
-    - vnc4server
     - chromium-browser
+    - vnc4server
+    - libjpeg-turbo8-dev
+    - zlib1g-dev
+    - gcc
+    - python3-dev
+    - python3-dbg
+    - adobe-flashplugin
     - xfonts-base
     - fonts-arphic-bkai00mp
     - fonts-arphic-bsmi00lp
@@ -24,51 +32,74 @@
     - fonts-sil-padauk
     - fonts-unfonts-extra
     - fonts-unfonts-core
-    - ttf-indic-fonts
+    - fonts-indic
     - fonts-thai-tlwg
     - fonts-lklug-sinhala
-    - git
-    - libjpeg-turbo8-dev
-    - zlib1g-dev
-    - gcc
-    - g++
-    - libpython3.4-dev
-    - adobe-flashplugin
-- name: install Xvnc upstart config /etc/init/Xvnc.conf
-  template: src=templates/Xvnc.conf.j2 dest=/etc/init/Xvnc.conf
+- name: mkdir /etc/service/{Xvnc,vnc-websock,brozzler-worker}
+  file:
+    path: '/etc/service/{{item}}'
+    state: directory
+  with_items:
+    - Xvnc
+    - vnc-websock
+    - brozzler-worker
   become: true
+- name: install /etc/service/Xvnc/run
+  template:
+    src: templates/Xvnc-run.j2
+    dest: /etc/service/Xvnc/run
+    mode: 0755
   notify:
     - restart Xvnc
-- name: mkdir {{venv_root}}/websockify-ve34
   become: true
-  file: path={{venv_root}}/websockify-ve34 state=directory owner={{user}}
+- name: mkdir {{venv_root}}/websockify-ve3
+  become: true
+  file: path={{venv_root}}/websockify-ve3 state=directory owner={{user}}
 - name: install websockify in virtualenv
-  pip: name=git+https://github.com/kanaka/websockify.git#egg=websockify
-       virtualenv={{venv_root}}/websockify-ve34
-       virtualenv_python=python3.4
-       extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
+  pip:
+    name: git+https://github.com/kanaka/websockify.git#egg=websockify
+    virtualenv: '{{venv_root}}/websockify-ve3'
+    virtualenv_python: python3
+    virtualenv_command: python3 /usr/lib/python3/dist-packages/virtualenv.py
+    extra_args: '--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
   become: true
   become_user: '{{user}}'
-- name: install vnc-websock upstart config /etc/init/vnc-websock.conf
-  template: src=templates/vnc-websock.conf.j2 dest=/etc/init/vnc-websock.conf
-  become: true
+- name: install /etc/service/vnc-websock/run
+  template:
+    src: templates/vnc-websock-run.j2
+    dest: /etc/service/vnc-websock/run
+    mode: 0755
   notify:
     - restart vnc-websock
-- name: mkdir {{venv_root}}/brozzler-ve34
   become: true
-  file: path={{venv_root}}/brozzler-ve34 state=directory owner={{user}}
+- name: mkdir {{venv_root}}/brozzler-ve3
+  become: true
+  file: path={{venv_root}}/brozzler-ve3 state=directory owner={{user}}
 - name: install brozzler in virtualenv
-  pip: # name=git+https://github.com/internetarchive/brozzler.git#egg=brozzler
-       name='{{brozzler_pip_name}}'
-       virtualenv={{venv_root}}/brozzler-ve34
-       virtualenv_python=python3.4
-       extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
+  pip:
+    name: '{{brozzler_pip_name}}'
+    virtualenv: '{{venv_root}}/brozzler-ve3'
+    virtualenv_python: python3
+    virtualenv_command: python3 /usr/lib/python3/dist-packages/virtualenv.py
+    extra_args: '--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
   become: true
   become_user: '{{user}}'
   notify:
     - restart brozzler-worker
-- name: install brozzler-worker upstart config /etc/init/brozzler-worker.conf
-  template: src=templates/brozzler-worker.conf.j2 dest=/etc/init/brozzler-worker.conf
-  become: true
+- name: install /etc/service/brozzler-worker/run
+  template:
+    src: templates/brozzler-worker-run.j2
+    dest: /etc/service/brozzler-worker/run
+    mode: 0755
   notify:
     - restart brozzler-worker
+  become: true

View file

@@ -0,0 +1,14 @@
#!/bin/bash
cd /tmp
logfile=/var/log/Xvnc.log
touch $logfile
chown {{user}} $logfile
exec nice setuidgid {{user}} Xvnc4 :1 -auth /tmp/Xauthority.{{user}} \
-geometry 1600x1000 -depth 24 -rfbwait 0 -nolisten tcp -rfbport 5901 \
-SecurityTypes None -pn -fp /usr/share/fonts/X11/misc/ -co /etc/X11/rgb \
AcceptCutText=0 AcceptPointerEvents=0 AcceptKeyEvents=0 \
>> $logfile 2>&1

View file

@@ -1,14 +0,0 @@
description "Xvnc"
start on runlevel [2345]
stop on runlevel [!2345]
setuid {{user}}
console log
exec nice Xvnc4 :1 -auth /tmp/Xauthority.{{user}} \
-geometry 1600x1000 -depth 24 -rfbwait 0 -nolisten tcp -rfbport 5901 \
-SecurityTypes None -pn -fp /usr/share/fonts/X11/misc/ -co /etc/X11/rgb \
AcceptCutText=0 AcceptPointerEvents=0 AcceptKeyEvents=0

View file

@@ -0,0 +1,17 @@
#!/bin/bash
logfile=/var/log/brozzler-worker.log
touch $logfile
chown {{user}} $logfile
source {{venv_root}}/brozzler-ve3/bin/activate
exec nice setuidgid {{user}} \
env DISPLAY=:1 LANG=en_US.UTF-8 LC_COLLATE=C \
brozzler-worker \
--rethinkdb-servers={{groups['rethinkdb'] | join(',')}} \
--max-browsers=4 \
--trace \
--warcprox-auto \
>> $logfile 2>&1

View file

@@ -1,25 +0,0 @@
description "brozzler-worker"
start on runlevel [2345]
stop on runlevel [!2345]
env DISPLAY=:1
env PATH={{venv_root}}/brozzler-ve34/bin:/usr/bin:/bin
env PYTHONPATH={{venv_root}}/brozzler-ve34/lib/python3.4/site-packages
env LANG=C.UTF-8
setuid {{user}}
console log
# depends on vnc server
start on started Xvnc
stop on stopping Xvnc
kill timeout 60
exec nice brozzler-worker \
--rethinkdb-servers={{groups['rethinkdb'] | join(',')}} \
--max-browsers=4 \
--verbose \
--warcprox-auto

View file

@@ -0,0 +1,10 @@
#!/bin/bash
logfile=/var/log/vnc-websock.log
touch $logfile
chown {{user}} $logfile
source /opt/websockify-ve3/bin/activate
exec nice setuidgid {{user}} websockify 0.0.0.0:8901 localhost:5901 >> $logfile 2>&1

View file

@@ -1,15 +0,0 @@
description "vnc-websock"
start on runlevel [2345]
stop on runlevel [!2345]
setuid {{user}}
console log
env PYTHONPATH={{venv_root}}/websockify-ve34/lib/python3.4/site-packages
env PATH={{venv_root}}/websockify-ve34/bin:/usr/bin:/bin
# port 8901 is hard-coded in brozzler/dashboard/static/partials/workers.html
exec nice websockify 0.0.0.0:8901 localhost:5901

View file

@@ -1,44 +1,74 @@
 ---
-# get latest pip (had problems with version from apt-get, specifically
-# "pip install pyopenssl" did not install the dependency "cryptography")
-# http://stackoverflow.com/questions/34587473/what-is-get-pip-py-checksum-where-can-i-get-it-for-sure
-- name: install setuptools for python 2 and 3
-  become: true
-  apt: name={{item}} state=present
-  with_items:
-    - python-setuptools
+- apt:
+    name:
     - python3-setuptools
-- name: download pip-9.0.1.tar.gz
-  get_url:
-    url: https://pypi.python.org/packages/11/b6/abcb525026a4be042b486df43905d6893fb04f05aac21c32c638e939e447/pip-9.0.1.tar.gz
-    dest: /tmp
-    checksum: sha1:57ff41e99cb01b6a1c2b0999161589b726f0ec8b
-- name: extract pip-9.0.1.tar.gz
-  unarchive: src=/tmp/pip-9.0.1.tar.gz dest=/tmp copy=no
+    - python3-pip
+    - python3-virtualenv
+    - daemontools
+    - daemontools-run
+    state: present
+    update_cache: yes
+    cache_valid_time: 86400 # one day
   become: true
+# # get recent virtualenv, which bundles a recent pip
+# - find:
+#     paths:
+#       - /usr/local/lib/python3.4/dist-packages
+#       - /usr/local/lib/python3.5/dist-packages
+#     recurse: true
+#     patterns: virtualenv.py
+#     contains: '__version__ = "16.4.3"'
+#   register: virtualenv_py_16_4_3
+#
+# - command: mktemp -d
+#   register: mktempd_out
+#   when: virtualenv_py_16_4_3.matched == 0
+#
+# - name: download virtualenv-16.4.3
+#   get_url:
+#     url: https://files.pythonhosted.org/packages/37/db/89d6b043b22052109da35416abc3c397655e4bd3cff031446ba02b9654fa/virtualenv-16.4.3.tar.gz
+#     dest: '{{mktempd_out.stdout}}'
+#     checksum: sha256:984d7e607b0a5d1329425dd8845bd971b957424b5ba664729fab51ab8c11bc39
+#   when: virtualenv_py_16_4_3.matched == 0
+#
+# - name: extract virtualenv-16.4.3.tar.gz
+#   unarchive:
+#     src: '{{mktempd_out.stdout}}/virtualenv-16.4.3.tar.gz'
+#     dest: '{{mktempd_out.stdout}}'
+#     copy: no
+#   when: virtualenv_py_16_4_3.matched == 0
+#
+# - name: run "python3 setup.py install" in {{mktempd_out.stdout}}/virtualenv-16.4.3
+#   become: true
+#   command: python3 setup.py install
+#   args:
+#     chdir: '{{mktempd_out.stdout}}/virtualenv-16.4.3'
+#   when: virtualenv_py_16_4_3.matched == 0
+#
+# - file:
+#     path: '{{mktempd_out.stdout}}'
+#     state: absent
+#   become: true
+#   when: virtualenv_py_16_4_3.matched == 0
 # this clause is a workaround for travis-ci, which only wants to install in /usr
 # see https://travis-ci.org/internetarchive/brozzler/builds/174338601
-# but it complains that /usr/lib/python3.4/site-packages doesn't exist
+# but it complains that /usr/lib/python3.5/site-packages doesn't exist
 # see https://travis-ci.org/internetarchive/brozzler/builds/174094831
-- file: path={{item}} state=directory
+- file:
+    path: '{{item}}'
+    state: directory
   with_items:
-    - /usr/lib/python3.4/site-packages
-    - /usr/lib/python3.4/dist-packages
+    - /usr/lib/python3.5/site-packages
+    - /usr/lib/python3.5/dist-packages
   become: true
-- name: run "python3 setup.py install" in /tmp/pip-9.0.1
-  command: python3 setup.py install
-           chdir=/tmp/pip-9.0.1
-           creates=/usr/local/lib/python3.4/dist-packages/pip-9.0.1-py3.4.egg/pip/__init__.py
-  become: true
-- name: run "pip install virtualenv"
-  command: pip install virtualenv
-           creates=/usr/local/lib/python3.4/dist-packages/virtualenv.py
-  become: true
 - command: id {{user}}
   register: id_user
   ignore_errors: true
   changed_when: false
 - name: ensure service user {{user}} exists
   user: name={{user}} system=yes createhome=no home=/nonexistent
         shell=/usr/sbin/nologin

View file

@@ -1,5 +1,9 @@
 ---
 - name: restart pywb
-  service: name=pywb state=restarted
+  svc:
+    name: pywb
+    state: restarted
+    service_dir: /etc/service
   become: true

View file

@@ -1,36 +1,52 @@
 ---
-- name: mkdir {{venv_root}}/pywb-ve34
-  file: path={{venv_root}}/pywb-ve34 state=directory
+- name: mkdir {{venv_root}}/pywb-ve3
+  file: path={{venv_root}}/pywb-ve3 state=directory
         owner={{user}}
   become: true
 - name: install pywb in virtualenv
-  pip: name=pywb
-       version=0.33.2
-       virtualenv={{venv_root}}/pywb-ve34
-       virtualenv_python=python3.4
-       extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
+  pip:
+    name: pywb
+    version: 0.33.2
+    virtualenv: '{{venv_root}}/pywb-ve3'
+    virtualenv_python: python3
+    virtualenv_command: python3 /usr/lib/python3/dist-packages/virtualenv.py
+    extra_args: '--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
   become: true
   become_user: '{{user}}'
   notify:
     - restart pywb
 - name: install brozzler in pywb virtualenv
-  pip: name='{{brozzler_pip_name}}'
-       virtualenv={{venv_root}}/pywb-ve34
-       virtualenv_python=python3.4
-       extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
+  pip:
+    name: '{{brozzler_pip_name}}'
+    virtualenv: '{{venv_root}}/pywb-ve3'
+    virtualenv_python: python3
+    virtualenv_command: python3 /usr/lib/python3/dist-packages/virtualenv.py
+    extra_args: '--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
   become: true
   become_user: '{{user}}'
   notify:
     - restart pywb
 - name: pywb config file /etc/pywb.yml
   template: src=templates/pywb.yml.j2
             dest=/etc/pywb.yml
   become: true
   notify:
     - restart pywb
-- name: upstart config file /etc/init/pywb.conf
-  template: src=templates/pywb.conf.j2
-            dest=/etc/init/pywb.conf
+- name: mkdir /etc/service/pywb
+  file:
+    path: /etc/service/pywb
+    state: directory
   become: true
+- name: install /etc/service/pywb/run
+  template:
+    src: templates/pywb-run.j2
+    dest: /etc/service/pywb/run
+    mode: 0755
   notify:
     - restart pywb
+  become: true

View file

@@ -0,0 +1,10 @@
#!/bin/bash
logfile=/var/log/pywb.log
touch $logfile
chown {{user}} $logfile
exec nice setuidgid {{user}} env PYWB_CONFIG_FILE=/etc/pywb.yml \
{{venv_root}}/pywb-ve3/bin/python {{venv_root}}/pywb-ve3/bin/brozzler-wayback \
>> $logfile 2>&1

View file

@@ -1,14 +0,0 @@
description "pywb"
start on runlevel [2345]
stop on runlevel [!2345]
env PYTHONPATH={{venv_root}}/pywb-ve34/lib/python3.4/site-packages
env PATH={{venv_root}}/pywb-ve34/bin:/usr/bin:/bin
env PYWB_CONFIG_FILE=/etc/pywb.yml
setuid {{user}}
console log
exec nice brozzler-wayback

View file

@@ -3,8 +3,9 @@
   apt_key: url=http://download.rethinkdb.com/apt/pubkey.gpg
   become: true
 - name: ensure rethinkdb repo is in apt sources.list
-  apt_repository: repo='deb http://download.rethinkdb.com/apt trusty main'
-                  state=present
+  apt_repository:
+    repo: 'deb http://download.rethinkdb.com/apt {{ansible_lsb.codename|lower}} main'
+    state: present
   become: true
 - apt: update_cache=yes
   become: true

View file

@@ -1,4 +1,7 @@
 ---
 - name: restart warcprox
-  service: name=warcprox state=restarted
+  svc:
+    name: warcprox
+    state: restarted
+    service_dir: /etc/service
   become: true

View file

@@ -4,26 +4,37 @@
   apt: name={{item}} state=present
   with_items:
     - gcc
-    - python3.4
-    - libpython3.4-dev
+    - python3-dev
    - libffi-dev
     - libssl-dev
     - tor
     - git
-- name: mkdir {{venv_root}}/warcprox-ve34
+- name: mkdir {{venv_root}}/warcprox-ve3
   become: true
-  file: path={{venv_root}}/warcprox-ve34 state=directory owner={{user}}
+  file: path={{venv_root}}/warcprox-ve3 state=directory owner={{user}}
 - name: install warcprox in virtualenv
-  pip: name=git+https://github.com/internetarchive/warcprox.git#egg=warcprox
-       virtualenv={{venv_root}}/warcprox-ve34
-       virtualenv_python=python3.4
-       extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
+  pip:
+    name: git+https://github.com/internetarchive/warcprox.git#egg=warcprox
+    virtualenv: '{{venv_root}}/warcprox-ve3'
+    virtualenv_python: python3
+    extra_args: --no-input --upgrade --pre --cache-dir=/tmp/pip-cache
+    virtualenv_command: python3 /usr/lib/python3/dist-packages/virtualenv.py
   become: true
   become_user: '{{user}}'
   notify:
     - restart warcprox
-- name: install upstart config /etc/init/warcprox.conf
+- name: mkdir /etc/service/warcprox
+  file:
+    path: /etc/service/warcprox
+    state: directory
   become: true
-  template: src=templates/warcprox.conf.j2 dest=/etc/init/warcprox.conf
+- name: install /etc/service/warcprox/run
+  template:
+    src: templates/run.j2
+    dest: /etc/service/warcprox/run
+    mode: 0755
   notify:
     - restart warcprox
+  become: true

View file

@@ -1,19 +1,16 @@
-description "warcprox"
-start on runlevel [2345]
-stop on runlevel [!2345]
-env PYTHONPATH={{venv_root}}/warcprox-ve34/lib/python3.4/site-packages
-env PATH={{venv_root}}/warcprox-ve34/bin:/usr/bin:/bin
-# by default warcprox creates some files/dirs relative to cwd
-chdir {{work_dir}}
-setuid {{user}}
-console log
-# --profile
-exec nice warcprox \
+#!/bin/bash
+logfile=/var/log/warcprox.log
+touch $logfile
+chown {{user}} $logfile
+ulimit -n 4096
+cd {{work_dir}}
+source {{venv_root}}/warcprox-ve3/bin/activate
+exec nice -n5 setuidgid {{user}} env LANG=en_US.UTF-8 LC_COLLATE=C warcprox \
     --address=0.0.0.0 \
     --dir={{warcs_dir}} \
     --base32 \
@@ -22,4 +19,6 @@ exec nice warcprox \
     --onion-tor-socks-proxy=localhost:9050 \
     --rethinkdb-services-url=rethinkdb://{{groups['rethinkdb']|join(',')}}/brozzler/services \
     --rethinkdb-stats-url=rethinkdb://{{groups['rethinkdb']|join(',')}}/brozzler/stats \
-    --rethinkdb-big-table-url=rethinkdb://{{groups['rethinkdb']|join(',')}}/brozzler/captures
+    --rethinkdb-big-table-url=rethinkdb://{{groups['rethinkdb']|join(',')}}/brozzler/captures \
+    >> $logfile 2>&1

View file

@@ -162,7 +162,7 @@ class ThreadExceptionGate:
     def queue_exception(self, e):
         with self.lock:
             if self.pending_exception:
-                self.logger.warn(
+                self.logger.warning(
                         '%r already pending for thread %r, discarding %r',
                         self.pending_exception, self.thread, e)
             else:

View file

@@ -223,7 +223,7 @@ class Chrome:
                 raise
             except Exception as e:
                 if time.time() - self._last_warning > 30:
-                    self.logger.warn(
+                    self.logger.warning(
                             'problem with %s (will keep trying until timeout '
                             'of %d seconds): %s', json_url, timeout_sec, e)
                     self._last_warning = time.time()
@@ -294,7 +294,7 @@ class Chrome:
                         'chrome pid %s exited normally',
                         self.chrome_process.pid)
             else:
-                self.logger.warn(
+                self.logger.warning(
                         'chrome pid %s exited with nonzero status %s',
                         self.chrome_process.pid, status)
@@ -305,13 +305,13 @@ class Chrome:
                 return
             time.sleep(0.5)
 
-        self.logger.warn(
+        self.logger.warning(
                 'chrome pid %s still alive %.1f seconds after sending '
                 'SIGTERM, sending SIGKILL', self.chrome_process.pid,
                 time.time() - t0)
         os.killpg(self.chrome_process.pid, signal.SIGKILL)
         status = self.chrome_process.wait()
-        self.logger.warn(
+        self.logger.warning(
                 'chrome pid %s reaped (status=%s) after killing with '
                 'SIGKILL', self.chrome_process.pid, status)

View file

@@ -2,7 +2,7 @@
 '''
 brozzler/cli.py - brozzler command line executables
 
-Copyright (C) 2014-2017 Internet Archive
+Copyright (C) 2014-2019 Internet Archive
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -606,6 +606,10 @@ def brozzler_purge(argv=None):
         '--site', dest='site', metavar='SITE_ID', help=(
             'purge crawl state from rethinkdb for a site, including all '
             'pages'))
+    group.add_argument(
+        '--finished-before', dest='finished_before', metavar='YYYY-MM-DD',
+        help=('purge crawl state from rethinkdb for a jobs that ended '
+              'before this date'))
     arg_parser.add_argument(
         '--force', dest='force', action='store_true', help=(
             'purge even if job or site is still has status ACTIVE'))
@@ -628,7 +632,7 @@ def brozzler_purge(argv=None):
             sys.exit(1)
         if job.status == 'ACTIVE':
             if args.force:
-                logging.warn(
+                logging.warning(
                     'job %s has status ACTIVE, purging anyway because '
                     '--force was supplied', job_id)
             else:
@@ -645,7 +649,7 @@ def brozzler_purge(argv=None):
             sys.exit(1)
         if site.status == 'ACTIVE':
             if args.force:
-                logging.warn(
+                logging.warning(
                     'site %s has status ACTIVE, purging anyway because '
                     '--force was supplied', site_id)
             else:
@@ -654,6 +658,20 @@ def brozzler_purge(argv=None):
                     '(override with --force)', site_id)
             sys.exit(1)
         _purge_site(rr, site_id)
+    elif args.finished_before:
+        finished_before = datetime.datetime.strptime(
+            args.finished_before, '%Y-%m-%d').replace(
+                tzinfo=doublethink.UTC)
+        reql = rr.table('jobs').filter(
+            r.row['finished'].default(r.maxval).lt(finished_before).or_(
+                r.row['starts_and_stops'].nth(-1)['stop'].default(r.maxval).lt(finished_before)))
+        logging.debug(
+            'retrieving jobs older than %s: %s', finished_before, reql)
+        for job in reql.run():
+            # logging.info('job %s finished=%s starts_and_stops[-1]["stop"]=%s',
+            #         job['id'], job.get('finished'),
+            #         job.get('starts_and_stops', [{'stop':None}])[-1]['stop'])
+            _purge_job(rr, job['id'])
 
 def _purge_site(rr, site_id):
     reql = rr.table('pages').between(
@@ -713,7 +731,7 @@ def brozzler_list_captures(argv=None):
     if args.url_or_sha1[:5] == 'sha1:':
         if args.prefix:
-            logging.warn(
+            logging.warning(
                     'ignoring supplied --prefix option which does not apply '
                     'to lookup by sha1')
         # assumes it's already base32 (XXX could detect if hex and convert)
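The new --finished-before branch above leans on ReQL's default(): a job with no 'finished' field (or no final 'stop') falls back to r.maxval, so it can never compare less-than the cutoff and still-active jobs are left alone. A rough standalone sketch of the same query outside the CLI (assuming the rethinkdb driver is importable as r, as cli.py does, and a local RethinkDB with the brozzler db):

import datetime
import doublethink
import rethinkdb as r

rr = doublethink.Rethinker('localhost', db='brozzler')
cutoff = datetime.datetime.strptime('2019-01-01', '%Y-%m-%d').replace(
        tzinfo=doublethink.UTC)
# jobs whose 'finished' time, or last 'stop' time, is before the cutoff;
# missing fields default to r.maxval so unfinished jobs never match
reql = rr.table('jobs').filter(
        r.row['finished'].default(r.maxval).lt(cutoff).or_(
            r.row['starts_and_stops'].nth(-1)['stop'].default(r.maxval).lt(cutoff)))
for job in reql.run():
    print(job['id'])    # brozzler-purge would call _purge_job(rr, job['id']) here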

View file

@@ -260,7 +260,7 @@ class BrozzlerEasyController:
             state_strs.append(str(th))
             stack = traceback.format_stack(sys._current_frames()[th.ident])
             state_strs.append(''.join(stack))
-        logging.warn('dumping state (caught signal {})\n{}'.format(
+        logging.warning('dumping state (caught signal {})\n{}'.format(
             signum, '\n'.join(state_strs)))
 
 def main(argv=None):

View file

@@ -138,7 +138,7 @@ class RethinkDbFrontier:
         sites = []
         for i in range(result["replaced"]):
             if result["changes"][i]["old_val"]["claimed"]:
-                self.logger.warn(
+                self.logger.warning(
                         "re-claimed site that was still marked 'claimed' "
                         "because it was last claimed a long time ago "
                         "at %s, and presumably some error stopped it from "
@@ -225,7 +225,7 @@ class RethinkDbFrontier:
         if not job:
             return False
         if job.status.startswith("FINISH"):
-            self.logger.warn("%s is already %s", job, job.status)
+            self.logger.warning("%s is already %s", job, job.status)
             return True
 
         results = self.rr.table("sites").get_all(job_id, index="job_id").run()
@@ -415,7 +415,7 @@ class RethinkDbFrontier:
             assert isinstance(e, brozzler.ReachedLimit)
             if (site.reached_limit
                     and site.reached_limit != e.warcprox_meta["reached-limit"]):
-                self.logger.warn(
+                self.logger.warning(
                         "reached limit %s but site had already reached limit %s",
                         e.warcprox_meta["reached-limit"], self.reached_limit)
             else:
@@ -434,7 +434,7 @@ class RethinkDbFrontier:
                 index="priority_by_site").filter({"hops_from_seed":0}).run()
         pages = list(results)
         if len(pages) > 1:
-            self.logger.warn(
+            self.logger.warning(
                     "more than one seed page for site_id %s ?", site_id)
         if len(pages) < 1:
             return None

View file

@@ -106,7 +106,7 @@ def is_permitted_by_robots(site, url, proxy=None):
             # reppy has wrapped an exception that we want to bubble up
             raise brozzler.ProxyError(e)
         else:
-            logging.warn(
+            logging.warning(
                     "returning true (permitted) after problem fetching "
                     "robots.txt for %r: %r", url, e)
             return True

View file

@@ -147,13 +147,13 @@ class BrozzlerWorker:
         try:
             with urllib.request.urlopen(request, timeout=600) as response:
                 if response.getcode() != 204:
-                    self.logger.warn(
+                    self.logger.warning(
                             'got "%s %s" response on warcprox '
                             'WARCPROX_WRITE_RECORD request (expected 204)',
                             response.getcode(), response.reason)
             return request, response
         except urllib.error.HTTPError as e:
-            self.logger.warn(
+            self.logger.warning(
                     'got "%s %s" response on warcprox '
                     'WARCPROX_WRITE_RECORD request (expected 204)',
                     e.getcode(), e.info())
@@ -370,7 +370,7 @@ class BrozzlerWorker:
             if (page.needs_robots_check and
                     not brozzler.is_permitted_by_robots(
                         site, page.url, self._proxy_for(site))):
-                logging.warn("page %s is blocked by robots.txt", page.url)
+                logging.warning("page %s is blocked by robots.txt", page.url)
                 page.blocked_by_robots = True
                 self._frontier.completed_page(site, page)
             else:
@@ -544,7 +544,7 @@ class BrozzlerWorker:
     def start(self):
         with self._start_stop_lock:
             if self._thread:
-                self.logger.warn(
+                self.logger.warning(
                         'ignoring start request because self._thread is '
                         'not None')
                 return

View file

@@ -48,7 +48,7 @@ _orig_webpage_read_content = youtube_dl.extractor.generic.GenericIE._webpage_read_content
 def _webpage_read_content(self, *args, **kwargs):
     content = _orig_webpage_read_content(self, *args, **kwargs)
     if len(content) > 20000000:
-        logging.warn(
+        logging.warning(
                 'bypassing youtube-dl extraction because content is '
                 'too large (%s characters)', len(content))
         return ''
@@ -185,7 +185,7 @@ def _build_youtube_dl(worker, destdir, site):
                     mimetype = magic.from_file(ctx['filename'], mime=True)
                 except ImportError as e:
                     mimetype = 'video/%s' % info_dict['ext']
-                    self.logger.warn(
+                    self.logger.warning(
                             'guessing mimetype %s because %r', mimetype, e)
                 url = 'youtube-dl:%05d:%s' % (

View file

@@ -339,12 +339,12 @@ Brozzler derives its general approach to the seed surt from `heritrix
    slash.
 2. Canonicalization does not attempt to match heritrix exactly, though it
    usually does match.
-3. When generating a SURT for an HTTPS URL, heritrix changes the scheme to
-   HTTP. For example, the heritrix SURT for ``https://www.example.com/`` is
-   ``http://(com,example,www,)`` and this means that all of
-   ``http://www.example.com/*`` and ``https://www.example.com/*`` are in
-   scope. It also means that a manually specified SURT with scheme "https" does
-   not match anything. Brozzler does no scheme munging.
+3. Brozzler does no scheme munging. (When generating a SURT for an HTTPS URL,
+   heritrix changes the scheme to HTTP. For example, the heritrix SURT for
+   ``https://www.example.com/`` is ``http://(com,example,www,)`` and this means
+   that all of ``http://www.example.com/*`` and ``https://www.example.com/*``
+   are in scope. It also means that a manually specified SURT with scheme
+   "https" does not match anything.)
 4. Brozzler identifies seed "redirects" by retrieving the URL from the
    browser's location bar at the end of brozzling the seed page, whereas
    heritrix follows HTTP 3XX redirects. If the URL in the browser
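Point 3 is the one that trips people up in practice: with an https seed, plain-http URLs on the same host fall outside the scope unless they are added explicitly. A toy illustration of the prefix check, using the ssurt form that appears in the updated tests ('localhost,//<port>:http:/site5/redirect/'); the https variant below is an assumption extrapolated from that format:

# hypothetical ssurt strings; brozzler keeps the original scheme in the surt
accept_ssurt = 'com,example,www,//:https:/'             # from an https seed
candidate    = 'com,example,www,//:http:/page2.html'    # same host, plain http

# heritrix would have folded both to http:// and matched; brozzler does not
print(candidate.startswith(accept_ssurt))   # False -- the http page is out of scope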

View file

@@ -32,7 +32,7 @@ def find_package_data(package):
 
 setuptools.setup(
         name='brozzler',
-        version='1.5.4',
+        version='1.5.6',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',
@@ -64,10 +64,10 @@ setuptools.setup(
            ],
         },
         install_requires=[
-            'PyYAML>=3.12',
+            'PyYAML>=5.1',
             'youtube-dl>=2018.7.21',
             'reppy==0.3.4',
-            'requests>=2.18.4',
+            'requests>=2.21',
             'websocket-client>=0.39.0,<=0.48.0',
             'pillow>=5.2.0',
             'urlcanon>=0.1.dev23',
@@ -80,13 +80,13 @@ setuptools.setup(
         ],
         extras_require={
             'dashboard': [
-                'flask>=0.11',
+                'flask>=1.0',
                 'gunicorn>=19.8.1'
             ],
             'easy': [
                 'warcprox>=2.4b2.dev173',
                 'pywb>=0.33.2,<2',
-                'flask>=0.11',
+                'flask>=1.0',
                 'gunicorn>=19.8.1'
             ],
         },

View file

@@ -67,8 +67,8 @@ def httpd(request):
                 self.send_header('WWW-Authenticate', 'Basic realm=\"Test\"')
                 self.send_header('Content-type', 'text/html')
                 self.end_headers()
-                self.wfile.write(self.headers.getheader('Authorization'))
-                self.wfile.write('not authenticated')
+                self.wfile.write(self.headers.get('Authorization', b''))
+                self.wfile.write(b'not authenticated')
             else:
                 super().do_GET()

View file

@ -34,16 +34,41 @@ import http.server
import logging import logging
import warcprox import warcprox
# https://stackoverflow.com/questions/166506/finding-local-ip-addresses-using-pythons-stdlib
def _local_address():
import socket
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
try:
s.connect(('10.255.255.255', 1)) # ip doesn't need to be reachable
return s.getsockname()[0]
except:
return '127.0.0.1'
finally:
s.close()
local_address = _local_address()
def start_service(service): def start_service(service):
subprocess.check_call(['sudo', 'service', service, 'start']) subprocess.check_call(['sudo', 'svc', '-u', '/etc/service/' + service])
def stop_service(service): def stop_service(service):
subprocess.check_call(['sudo', 'service', service, 'stop']) subprocess.check_call(['sudo', 'svc', '-d', '/etc/service/' + service])
while True:
status = subprocess.check_output(
['sudo', 'svstat', '/etc/service/' + service])
if b' down ' in status:
break
time.sleep(0.5)
@pytest.fixture(scope='module') @pytest.fixture(scope='module')
def httpd(request): def httpd(request):
class RequestHandler(http.server.SimpleHTTPRequestHandler): class RequestHandler(http.server.SimpleHTTPRequestHandler):
def do_POST(self):
logging.info('\n%s\n%s', self.requestline, self.headers)
self.do_GET()
def do_GET(self): def do_GET(self):
logging.info('\n%s\n%s', self.requestline, self.headers)
if self.path == '/site5/redirect/': if self.path == '/site5/redirect/':
self.send_response(303, 'See other') self.send_response(303, 'See other')
self.send_header('Connection', 'close') self.send_header('Connection', 'close')
@ -82,7 +107,7 @@ def httpd(request):
# SimpleHTTPRequestHandler always uses CWD so we have to chdir # SimpleHTTPRequestHandler always uses CWD so we have to chdir
os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs')) os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs'))
httpd = http.server.HTTPServer(('localhost', 0), RequestHandler) httpd = http.server.HTTPServer((local_address, 0), RequestHandler)
httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever) httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
httpd_thread.start() httpd_thread.start()
@ -94,6 +119,9 @@ def httpd(request):
return httpd return httpd
def make_url(httpd, rel_url):
return 'http://%s:%s%s' % (local_address, httpd.server_port, rel_url)
def test_httpd(httpd): def test_httpd(httpd):
''' '''
Tests that our http server is working as expected, and that two fetches Tests that our http server is working as expected, and that two fetches
@ -101,7 +129,7 @@ def test_httpd(httpd):
deduplication. deduplication.
''' '''
payload1 = content2 = None payload1 = content2 = None
url = 'http://localhost:%s/site1/file1.txt' % httpd.server_port url = make_url(httpd, '/site1/file1.txt')
with urllib.request.urlopen(url) as response: with urllib.request.urlopen(url) as response:
assert response.status == 200 assert response.status == 200
payload1 = response.read() payload1 = response.read()
@ -140,13 +168,13 @@ def test_brozzle_site(httpd):
test_id = 'test_brozzle_site-%s' % datetime.datetime.utcnow().isoformat() test_id = 'test_brozzle_site-%s' % datetime.datetime.utcnow().isoformat()
rr = doublethink.Rethinker('localhost', db='brozzler') rr = doublethink.Rethinker('localhost', db='brozzler')
site = brozzler.Site(rr, { site = brozzler.Site(rr, {
'seed': 'http://localhost:%s/site1/' % httpd.server_port, 'seed': make_url(httpd, '/site1/'),
'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}}) 'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
# the two pages we expect to be crawled # the two pages we expect to be crawled
page1 = 'http://localhost:%s/site1/' % httpd.server_port page1 = make_url(httpd, '/site1/')
page2 = 'http://localhost:%s/site1/file1.txt' % httpd.server_port page2 = make_url(httpd, '/site1/file1.txt')
robots = 'http://localhost:%s/robots.txt' % httpd.server_port robots = make_url(httpd, '/robots.txt')
# so we can examine rethinkdb before it does anything # so we can examine rethinkdb before it does anything
try: try:
@ -171,8 +199,7 @@ def test_brozzle_site(httpd):
pages = list(frontier.site_pages(site.id)) pages = list(frontier.site_pages(site.id))
assert len(pages) == 2 assert len(pages) == 2
assert {page.url for page in pages} == { assert {page.url for page in pages} == {
'http://localhost:%s/site1/' % httpd.server_port, make_url(httpd, '/site1/'), make_url(httpd, '/site1/file1.txt')}
'http://localhost:%s/site1/file1.txt' % httpd.server_port}
time.sleep(2) # in case warcprox hasn't finished processing urls time.sleep(2) # in case warcprox hasn't finished processing urls
# take a look at the captures table # take a look at the captures table
@ -255,8 +282,8 @@ def test_proxy_non_warcprox(httpd):
start_service('brozzler-worker') start_service('brozzler-worker')
assert len(proxy.requests) <= 15 assert len(proxy.requests) <= 15
assert proxy.requests.count('GET /status') == 1 assert proxy.requests.count('GET /status') == 1
assert ('GET http://localhost:%s/site1/' % httpd.server_port) in proxy.requests assert ('GET %s' % make_url(httpd, '/site1/')) in proxy.requests
assert ('GET http://localhost:%s/site1/file1.txt' % httpd.server_port) in proxy.requests assert ('GET %s' % make_url(httpd, '/site1/file1.txt')) in proxy.requests
assert [req for req in proxy.requests if req.startswith('WARCPROX_WRITE_RECORD')] == [] assert [req for req in proxy.requests if req.startswith('WARCPROX_WRITE_RECORD')] == []
proxy.shutdown() proxy.shutdown()
@ -292,14 +319,14 @@ def _test_proxy_setting(
datetime.datetime.utcnow().isoformat()) datetime.datetime.utcnow().isoformat())
# the two pages we expect to be crawled # the two pages we expect to be crawled
page1 = 'http://localhost:%s/site1/' % httpd.server_port page1 = make_url(httpd, '/site1/')
page2 = 'http://localhost:%s/site1/file1.txt' % httpd.server_port page2 = make_url(httpd, '/site1/file1.txt')
robots = 'http://localhost:%s/robots.txt' % httpd.server_port robots = make_url(httpd, '/robots.txt')
rr = doublethink.Rethinker('localhost', db='brozzler') rr = doublethink.Rethinker('localhost', db='brozzler')
service_registry = doublethink.ServiceRegistry(rr) service_registry = doublethink.ServiceRegistry(rr)
site = brozzler.Site(rr, { site = brozzler.Site(rr, {
'seed': 'http://localhost:%s/site1/' % httpd.server_port, 'seed': make_url(httpd, '/site1/'),
'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}}) 'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
assert site.id is None assert site.id is None
frontier = brozzler.RethinkDbFrontier(rr) frontier = brozzler.RethinkDbFrontier(rr)
@ -332,8 +359,8 @@ def _test_proxy_setting(
pages = list(frontier.site_pages(site.id)) pages = list(frontier.site_pages(site.id))
assert len(pages) == 2 assert len(pages) == 2
assert {page.url for page in pages} == { assert {page.url for page in pages} == {
'http://localhost:%s/site1/' % httpd.server_port, make_url(httpd, '/site1/'),
'http://localhost:%s/site1/file1.txt' % httpd.server_port} make_url(httpd, '/site1/file1.txt')}
time.sleep(2) # in case warcprox hasn't finished processing urls time.sleep(2) # in case warcprox hasn't finished processing urls
# take a look at the captures table # take a look at the captures table
@ -360,7 +387,7 @@ def test_obey_robots(httpd):
test_id = 'test_obey_robots-%s' % datetime.datetime.utcnow().isoformat() test_id = 'test_obey_robots-%s' % datetime.datetime.utcnow().isoformat()
rr = doublethink.Rethinker('localhost', db='brozzler') rr = doublethink.Rethinker('localhost', db='brozzler')
site = brozzler.Site(rr, { site = brozzler.Site(rr, {
'seed': 'http://localhost:%s/site1/' % httpd.server_port, 'seed': make_url(httpd, '/site1/'),
'user_agent': 'im a badbot', # robots.txt blocks badbot 'user_agent': 'im a badbot', # robots.txt blocks badbot
'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}}) 'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
@ -390,12 +417,12 @@ def test_obey_robots(httpd):
pages = list(frontier.site_pages(site.id)) pages = list(frontier.site_pages(site.id))
assert len(pages) == 1 assert len(pages) == 1
page = pages[0] page = pages[0]
assert page.url == 'http://localhost:%s/site1/' % httpd.server_port assert page.url == make_url(httpd, '/site1/')
assert page.blocked_by_robots assert page.blocked_by_robots
# take a look at the captures table # take a look at the captures table
time.sleep(2) # in case warcprox hasn't finished processing urls time.sleep(2) # in case warcprox hasn't finished processing urls
robots_url = 'http://localhost:%s/robots.txt' % httpd.server_port robots_url = make_url(httpd, '/robots.txt')
captures = list(rr.table('captures').filter({'test_id':test_id}).run()) captures = list(rr.table('captures').filter({'test_id':test_id}).run())
assert len(captures) == 1 assert len(captures) == 1
assert captures[0]['url'] == robots_url assert captures[0]['url'] == robots_url
@ -412,7 +439,7 @@ def test_login(httpd):
test_id = 'test_login-%s' % datetime.datetime.utcnow().isoformat() test_id = 'test_login-%s' % datetime.datetime.utcnow().isoformat()
rr = doublethink.Rethinker('localhost', db='brozzler') rr = doublethink.Rethinker('localhost', db='brozzler')
site = brozzler.Site(rr, { site = brozzler.Site(rr, {
'seed': 'http://localhost:%s/site2/' % httpd.server_port, 'seed': make_url(httpd, '/site2/'),
'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}, 'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}},
'username': 'test_username', 'password': 'test_password'}) 'username': 'test_username', 'password': 'test_password'})
@ -428,7 +455,7 @@ def test_login(httpd):
# take a look at the captures table # take a look at the captures table
time.sleep(2) # in case warcprox hasn't finished processing urls time.sleep(2) # in case warcprox hasn't finished processing urls
robots_url = 'http://localhost:%s/robots.txt' % httpd.server_port robots_url = make_url(httpd, '/robots.txt')
captures = list(rr.table('captures').filter( captures = list(rr.table('captures').filter(
{'test_id':test_id}).order_by('timestamp').run()) {'test_id':test_id}).order_by('timestamp').run())
meth_url = ['%s %s' % (c['http_method'], c['url']) for c in captures] meth_url = ['%s %s' % (c['http_method'], c['url']) for c in captures]
@ -436,25 +463,25 @@ def test_login(httpd):
# there are several forms in in htdocs/site2/login.html but only one # there are several forms in in htdocs/site2/login.html but only one
# that brozzler's heuristic should match and try to submit, and it has # that brozzler's heuristic should match and try to submit, and it has
# action='00', so we can check for that here # action='00', so we can check for that here
assert ('POST http://localhost:%s/site2/00' % httpd.server_port) in meth_url assert ('POST %s' % make_url(httpd, '/site2/00')) in meth_url
# sanity check the rest of the crawl # sanity check the rest of the crawl
assert ('GET http://localhost:%s/robots.txt' % httpd.server_port) in meth_url assert ('GET %s' % make_url(httpd, '/robots.txt')) in meth_url
assert ('GET http://localhost:%s/site2/' % httpd.server_port) in meth_url assert ('GET %s' % make_url(httpd, '/site2/')) in meth_url
assert ('WARCPROX_WRITE_RECORD screenshot:http://localhost:%s/site2/' % httpd.server_port) in meth_url assert ('WARCPROX_WRITE_RECORD screenshot:%s' % make_url(httpd, '/site2/')) in meth_url
assert ('WARCPROX_WRITE_RECORD thumbnail:http://localhost:%s/site2/' % httpd.server_port) in meth_url assert ('WARCPROX_WRITE_RECORD thumbnail:%s' % make_url(httpd, '/site2/')) in meth_url
assert ('GET http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url assert ('GET %s' % make_url(httpd, '/site2/login.html')) in meth_url
assert ('WARCPROX_WRITE_RECORD screenshot:http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url assert ('WARCPROX_WRITE_RECORD screenshot:%s' % make_url(httpd, '/site2/login.html')) in meth_url
assert ('WARCPROX_WRITE_RECORD thumbnail:http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url assert ('WARCPROX_WRITE_RECORD thumbnail:%s' % make_url(httpd, '/site2/login.html')) in meth_url
def test_seed_redirect(httpd): def test_seed_redirect(httpd):
test_id = 'test_seed_redirect-%s' % datetime.datetime.utcnow().isoformat() test_id = 'test_seed_redirect-%s' % datetime.datetime.utcnow().isoformat()
rr = doublethink.Rethinker('localhost', db='brozzler') rr = doublethink.Rethinker('localhost', db='brozzler')
seed_url = 'http://localhost:%s/site5/redirect/' % httpd.server_port seed_url = make_url(httpd, '/site5/redirect/')
site = brozzler.Site(rr, { site = brozzler.Site(rr, {
'seed': 'http://localhost:%s/site5/redirect/' % httpd.server_port, 'seed': make_url(httpd, '/site5/redirect/'),
'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}}) 'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
assert site.scope == {'accepts': [{'ssurt': 'localhost,//%s:http:/site5/redirect/' % httpd.server_port}]} assert site.scope == {'accepts': [{'ssurt': '%s//%s:http:/site5/redirect/' % (local_address, httpd.server_port)}]}
frontier = brozzler.RethinkDbFrontier(rr) frontier = brozzler.RethinkDbFrontier(rr)
brozzler.new_site(frontier, site) brozzler.new_site(frontier, site)
@ -473,19 +500,19 @@ def test_seed_redirect(httpd):
pages.sort(key=lambda page: page.hops_from_seed) pages.sort(key=lambda page: page.hops_from_seed)
assert pages[0].hops_from_seed == 0 assert pages[0].hops_from_seed == 0
assert pages[0].url == seed_url assert pages[0].url == seed_url
assert pages[0].redirect_url == 'http://localhost:%s/site5/destination/' % httpd.server_port assert pages[0].redirect_url == make_url(httpd, '/site5/destination/')
assert pages[1].hops_from_seed == 1 assert pages[1].hops_from_seed == 1
assert pages[1].url == 'http://localhost:%s/site5/destination/page2.html' % httpd.server_port assert pages[1].url == make_url(httpd, '/site5/destination/page2.html')
# check that scope has been updated properly # check that scope has been updated properly
assert site.scope == {'accepts': [ assert site.scope == {'accepts': [
{'ssurt': 'localhost,//%s:http:/site5/redirect/' % httpd.server_port}, {'ssurt': '%s//%s:http:/site5/redirect/' % (local_address, httpd.server_port)},
{'ssurt': 'localhost,//%s:http:/site5/destination/' % httpd.server_port}]} {'ssurt': '%s//%s:http:/site5/destination/' % (local_address, httpd.server_port)}]}
def test_hashtags(httpd): def test_hashtags(httpd):
test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat() test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat()
rr = doublethink.Rethinker('localhost', db='brozzler') rr = doublethink.Rethinker('localhost', db='brozzler')
seed_url = 'http://localhost:%s/site7/' % httpd.server_port seed_url = make_url(httpd, '/site7/')
site = brozzler.Site(rr, { site = brozzler.Site(rr, {
'seed': seed_url, 'seed': seed_url,
'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}}) 'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
@ -507,9 +534,9 @@ def test_hashtags(httpd):
assert pages[0].url == seed_url assert pages[0].url == seed_url
assert pages[0].hops_from_seed == 0 assert pages[0].hops_from_seed == 0
assert pages[0].brozzle_count == 1 assert pages[0].brozzle_count == 1
assert pages[0].outlinks['accepted'] == ['http://localhost:%s/site7/foo.html' % httpd.server_port] assert pages[0].outlinks['accepted'] == [make_url(httpd, '/site7/foo.html')]
assert not pages[0].hashtags assert not pages[0].hashtags
assert pages[1].url == 'http://localhost:%s/site7/foo.html' % httpd.server_port assert pages[1].url == make_url(httpd, '/site7/foo.html')
assert pages[1].hops_from_seed == 1 assert pages[1].hops_from_seed == 1
assert pages[1].brozzle_count == 1 assert pages[1].brozzle_count == 1
assert sorted(pages[1].hashtags) == ['#boosh','#ignored','#whee',] assert sorted(pages[1].hashtags) == ['#boosh','#ignored','#whee',]
@ -520,18 +547,18 @@ def test_hashtags(httpd):
captures_by_url = { captures_by_url = {
c['url']: c for c in captures if c['http_method'] != 'HEAD'} c['url']: c for c in captures if c['http_method'] != 'HEAD'}
assert seed_url in captures_by_url assert seed_url in captures_by_url
assert 'http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url assert make_url(httpd, '/site7/foo.html') in captures_by_url
assert 'http://localhost:%s/site7/whee.txt' % httpd.server_port in captures_by_url assert make_url(httpd, '/site7/whee.txt') in captures_by_url
assert 'http://localhost:%s/site7/boosh.txt' % httpd.server_port in captures_by_url assert make_url(httpd, '/site7/boosh.txt') in captures_by_url
assert 'screenshot:%s' % seed_url in captures_by_url assert 'screenshot:%s' % seed_url in captures_by_url
assert 'thumbnail:%s' % seed_url in captures_by_url assert 'thumbnail:%s' % seed_url in captures_by_url
assert 'screenshot:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url assert 'screenshot:%s' % make_url(httpd, '/site7/foo.html') in captures_by_url
assert 'thumbnail:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url assert 'thumbnail:%s' % make_url(httpd, '/site7/foo.html') in captures_by_url
def test_redirect_hashtags(httpd): def test_redirect_hashtags(httpd):
test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat() test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat()
rr = doublethink.Rethinker('localhost', db='brozzler') rr = doublethink.Rethinker('localhost', db='brozzler')
seed_url = 'http://localhost:%s/site9/' % httpd.server_port seed_url = make_url(httpd, '/site9/')
site = brozzler.Site(rr, { site = brozzler.Site(rr, {
'seed': seed_url, 'seed': seed_url,
'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}}) 'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
@@ -553,9 +580,9 @@ def test_redirect_hashtags(httpd):
assert pages[0].url == seed_url assert pages[0].url == seed_url
assert pages[0].hops_from_seed == 0 assert pages[0].hops_from_seed == 0
assert pages[0].brozzle_count == 1 assert pages[0].brozzle_count == 1
assert pages[0].outlinks['accepted'] == ['http://localhost:%s/site9/redirect.html' % httpd.server_port] assert pages[0].outlinks['accepted'] == [make_url(httpd, '/site9/redirect.html')]
assert not pages[0].hashtags assert not pages[0].hashtags
assert pages[1].url == 'http://localhost:%s/site9/redirect.html' % httpd.server_port assert pages[1].url == make_url(httpd, '/site9/redirect.html')
assert pages[1].hops_from_seed == 1 assert pages[1].hops_from_seed == 1
assert pages[1].brozzle_count == 1 assert pages[1].brozzle_count == 1
assert sorted(pages[1].hashtags) == ['#hash1','#hash2',] assert sorted(pages[1].hashtags) == ['#hash1','#hash2',]
@@ -563,7 +590,7 @@ def test_redirect_hashtags(httpd):
time.sleep(2) # in case warcprox hasn't finished processing urls time.sleep(2) # in case warcprox hasn't finished processing urls
# take a look at the captures table # take a look at the captures table
captures = rr.table('captures').filter({'test_id':test_id}).run() captures = rr.table('captures').filter({'test_id':test_id}).run()
redirect_captures = [c for c in captures if c['url'] == 'http://localhost:%s/site9/redirect.html' % httpd.server_port and c['http_method'] == 'GET'] redirect_captures = [c for c in captures if c['url'] == make_url(httpd, '/site9/redirect.html') and c['http_method'] == 'GET']
assert len(redirect_captures) == 2 # youtube-dl + browser, no hashtags assert len(redirect_captures) == 2 # youtube-dl + browser, no hashtags
# === expected captures === # === expected captures ===
@@ -589,9 +616,9 @@ def test_stop_crawl(httpd):
# create a new job with three sites that could be crawled forever # create a new job with three sites that could be crawled forever
job_conf = {'seeds': [ job_conf = {'seeds': [
{'url': 'http://localhost:%s/infinite/foo/' % httpd.server_port}, {'url': make_url(httpd, '/infinite/foo/')},
{'url': 'http://localhost:%s/infinite/bar/' % httpd.server_port}, {'url': make_url(httpd, '/infinite/bar/')},
{'url': 'http://localhost:%s/infinite/baz/' % httpd.server_port}]} {'url': make_url(httpd, '/infinite/baz/')}]}
job = brozzler.new_job(frontier, job_conf) job = brozzler.new_job(frontier, job_conf)
assert job.id assert job.id
@@ -675,7 +702,7 @@ def test_warcprox_outage_resiliency(httpd):
# put together a site to crawl # put together a site to crawl
test_id = 'test_warcprox_death-%s' % datetime.datetime.utcnow().isoformat() test_id = 'test_warcprox_death-%s' % datetime.datetime.utcnow().isoformat()
site = brozzler.Site(rr, { site = brozzler.Site(rr, {
'seed': 'http://localhost:%s/infinite/' % httpd.server_port, 'seed': make_url(httpd, '/infinite/'),
'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}}) 'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
try: try:
@@ -684,7 +711,7 @@ def test_warcprox_outage_resiliency(httpd):
try: try:
stop_service('warcprox') stop_service('warcprox')
except Exception as e: except Exception as e:
logging.warn('problem stopping warcprox service: %s', e) logging.warning('problem stopping warcprox service: %s', e)
# queue the site for brozzling # queue the site for brozzling
brozzler.new_site(frontier, site) brozzler.new_site(frontier, site)
@@ -771,7 +798,7 @@ def test_time_limit(httpd):
# create a new job with one seed that could be crawled forever # create a new job with one seed that could be crawled forever
job_conf = {'seeds': [{ job_conf = {'seeds': [{
'url': 'http://localhost:%s/infinite/foo/' % httpd.server_port, 'url': make_url(httpd, '/infinite/foo/'),
'time_limit': 20}]} 'time_limit': 20}]}
job = brozzler.new_job(frontier, job_conf) job = brozzler.new_job(frontier, job_conf)
assert job.id assert job.id
@@ -801,7 +828,7 @@ def test_ydl_stitching(httpd):
rr = doublethink.Rethinker('localhost', db='brozzler') rr = doublethink.Rethinker('localhost', db='brozzler')
frontier = brozzler.RethinkDbFrontier(rr) frontier = brozzler.RethinkDbFrontier(rr)
site = brozzler.Site(rr, { site = brozzler.Site(rr, {
'seed': 'http://localhost:%s/site10/' % httpd.server_port, 'seed': make_url(httpd, '/site10/'),
'warcprox_meta': { 'warcprox_meta': {
'warc-prefix': 'test_ydl_stitching', 'warc-prefix': 'test_ydl_stitching',
'captures-table-extra-fields': {'test_id':test_id}}}) 'captures-table-extra-fields': {'test_id':test_id}}})
@@ -819,7 +846,7 @@ def test_ydl_stitching(httpd):
assert len(pages) == 1 assert len(pages) == 1
page = pages[0] page = pages[0]
assert len(page.videos) == 6 assert len(page.videos) == 6
stitched_url = 'youtube-dl:00001:http://localhost:%s/site10/' % httpd.server_port stitched_url = 'youtube-dl:00001:%s' % make_url(httpd, '/site10/')
assert { assert {
'blame': 'youtube-dl', 'blame': 'youtube-dl',
'content-length': 267900, 'content-length': 267900,


@@ -24,27 +24,27 @@ the brozzler virtualenv.
:: ::
my-laptop$ vagrant ssh my-laptop$ vagrant ssh
vagrant@brzl:~$ source /opt/brozzler-ve34/bin/activate vagrant@brzl:~$ source /opt/brozzler-ve3/bin/activate
(brozzler-ve34)vagrant@brzl:~$ (brozzler-ve3)vagrant@brzl:~$
Then you can run brozzler-new-site: Then you can run brozzler-new-site:
:: ::
(brozzler-ve34)vagrant@brzl:~$ brozzler-new-site --proxy=localhost:8000 http://example.com/ (brozzler-ve3)vagrant@brzl:~$ brozzler-new-site --proxy=localhost:8000 http://example.com/
Or brozzler-new-job (make sure to set the proxy to localhost:8000): Or brozzler-new-job (make sure to set the proxy to localhost:8000):
:: ::
(brozzler-ve34)vagrant@brzl:~$ cat >job1.yml <<EOF (brozzler-ve3)vagrant@brzl:~$ cat >job1.yml <<EOF
id: job1 id: job1
proxy: localhost:8000 # point at warcprox for archiving proxy: localhost:8000 # point at warcprox for archiving
seeds: seeds:
- url: https://example.org/ - url: https://example.org/
EOF EOF
(brozzler-ve34)vagrant@brzl:~$ brozzler-new-job job1.yml (brozzler-ve3)vagrant@brzl:~$ brozzler-new-job job1.yml
WARC files will appear in ./warcs and brozzler, warcprox and rethinkdb logs in WARC files will appear in ./warcs and brozzler, warcprox and rethinkdb logs in
./logs (via vagrant folders syncing). ./logs (via vagrant folders syncing).
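The same kind of job can also be queued programmatically, which is how the tests in this commit do it. A minimal sketch along those lines, assuming RethinkDB is reachable on localhost and warcprox is proxying on localhost:8000 as in the Vagrant setup above::

    # minimal sketch mirroring the test setup in this commit; assumes
    # rethinkdb on localhost and warcprox listening on localhost:8000
    import brozzler
    import doublethink

    rr = doublethink.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(rr)
    job_conf = {
        'id': 'job1',
        'proxy': 'localhost:8000',  # point at warcprox for archiving
        'seeds': [{'url': 'https://example.org/'}],
    }
    job = brozzler.new_job(frontier, job_conf)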

vagrant/Vagrantfile

@@ -1,8 +1,9 @@
Vagrant.configure(2) do |config| Vagrant.configure(2) do |config|
config.vm.box = "ubuntu/trusty64" config.vm.box = "ubuntu/xenial64"
config.vm.define "10.9.9.9" config.vm.define "10.9.9.9"
config.vm.hostname = "brzl" config.vm.hostname = "brzl"
config.vm.network :private_network, ip: "10.9.9.9" config.vm.network :private_network, ip: "10.9.9.9"
config.disksize.size = '50GB'
config.vm.synced_folder "..", "/brozzler" config.vm.synced_folder "..", "/brozzler"
@@ -14,6 +15,7 @@ Vagrant.configure(2) do |config|
config.vm.provision "ansible" do |ansible| config.vm.provision "ansible" do |ansible|
ansible.inventory_path = "../ansible/hosts-vagrant" ansible.inventory_path = "../ansible/hosts-vagrant"
ansible.playbook = "../ansible/playbook.yml" ansible.playbook = "../ansible/playbook.yml"
# ansible.verbose = "-vvv"
end end
config.vm.provider 'virtualbox' do |v| config.vm.provider 'virtualbox' do |v|


@@ -10,12 +10,12 @@ cd $(dirname "${BASH_SOURCE[0]}")
vagrant up vagrant up
echo service status: echo service status:
vagrant ssh -- 'status warcprox ; vagrant ssh -- 'sudo svstat /etc/service/warcprox ;
status Xvnc ; sudo svstat /etc/service/Xvnc ;
status brozzler-worker ; sudo svstat /etc/service/brozzler-worker ;
status brozzler-dashboard ; sudo svstat /etc/service/brozzler-dashboard ;
status vnc-websock' sudo svstat /etc/service/vnc-websock'
echo echo
vagrant ssh -- 'set -x ; source /opt/brozzler-ve34/bin/activate && pip install pytest && pip install --upgrade --pre "warcprox>=2.1b1.dev86"' vagrant ssh -- 'set -x ; source /opt/brozzler-ve3/bin/activate && pip install pytest==4.3.0 && pip install --upgrade --pre "warcprox>=2.1b1.dev86"'
vagrant ssh -- "source /opt/brozzler-ve34/bin/activate && DISPLAY=:1 py.test -v /brozzler/tests $@" vagrant ssh -- "source /opt/brozzler-ve3/bin/activate && DISPLAY=:1 py.test --tb=native -v /brozzler/tests $@"


@@ -7,7 +7,7 @@ This is a standalone script with no dependencies other than python, and should
work with python 2.7 or python 3.2+. The only reason it's not a bash script is work with python 2.7 or python 3.2+. The only reason it's not a bash script is
so we can use the argparse library. so we can use the argparse library.
Copyright (C) 2016 Internet Archive Copyright (C) 2016-2019 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
@@ -41,9 +41,8 @@ def main(argv=[]):
subprocess.call([ subprocess.call([
'vagrant', 'ssh', '--', 'vagrant', 'ssh', '--',
'f=`mktemp` && cat > $f && ' 'f=`mktemp` && cat > $f && '
'PYTHONPATH=/home/vagrant/brozzler-ve34/lib/python3.4/site-packages ' '/home/vagrant/brozzler-ve3/bin/python '
'/home/vagrant/brozzler-ve34/bin/python ' '/home/vagrant/brozzler-ve3/bin/brozzler-new-job $f'],
'/home/vagrant/brozzler-ve34/bin/brozzler-new-job $f'],
stdin=f) stdin=f)
if __name__ == '__main__': if __name__ == '__main__':
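As the docstring above explains, these wrapper scripts are Python rather than bash only so they can use argparse. A hypothetical skeleton of that pattern, with illustrative option names rather than the script's actual interface; the inner vagrant ssh command is the one shown in the hunk above::

    # hypothetical argparse skeleton for a wrapper like the ones in this diff;
    # the option names are illustrative, not the script's real interface
    import argparse
    import subprocess
    import sys

    def main(argv):
        arg_parser = argparse.ArgumentParser(
                description='submit a brozzler job config inside the vagrant vm')
        arg_parser.add_argument(
                'job_conf_file', help='path to a brozzler job config yaml')
        args = arg_parser.parse_args(argv[1:])
        with open(args.job_conf_file, 'rb') as f:
            # pipe the job conf into brozzler-new-job inside the vm
            return subprocess.call([
                    'vagrant', 'ssh', '--',
                    'f=`mktemp` && cat > $f && '
                    '/home/vagrant/brozzler-ve3/bin/python '
                    '/home/vagrant/brozzler-ve3/bin/brozzler-new-job $f'],
                    stdin=f)

    if __name__ == '__main__':
        sys.exit(main(sys.argv))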


@@ -74,11 +74,8 @@ def main(argv=[]):
os.chdir(os.path.dirname(__file__)) os.chdir(os.path.dirname(__file__))
cmd = ( cmd = (
'PYTHONPATH=/home/vagrant/brozzler-ve34/lib/python3.4/site-packages ' '/opt/brozzler-ve3/bin/python /opt/brozzler-ve3/bin/brozzler-new-site '
'/home/vagrant/brozzler-ve34/bin/python ' '%s %s') % (' '.join(options), args.seed)
'/home/vagrant/brozzler-ve34/bin/brozzler-new-site '
'--proxy=localhost:8000 %s %s') % (
' '.join(options), args.seed)
subprocess.call(['vagrant', 'ssh', '--', cmd]) subprocess.call(['vagrant', 'ssh', '--', cmd])
if __name__ == '__main__': if __name__ == '__main__':