From d4f8bc768f87666d68562e5547949e57a952690c Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 18 Mar 2019 16:38:23 -0700 Subject: [PATCH 01/31] trying to make this work with xenial for travis see error https://travis-ci.org/internetarchive/brozzler/jobs/508141058 --- ansible/roles/rethinkdb/tasks/main.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ansible/roles/rethinkdb/tasks/main.yml b/ansible/roles/rethinkdb/tasks/main.yml index 774520b..b24414e 100644 --- a/ansible/roles/rethinkdb/tasks/main.yml +++ b/ansible/roles/rethinkdb/tasks/main.yml @@ -3,8 +3,9 @@ apt_key: url=http://download.rethinkdb.com/apt/pubkey.gpg become: true - name: ensure rethinkdb repo is in apt sources.list - apt_repository: repo='deb http://download.rethinkdb.com/apt trusty main' - state=present + apt_repository: + repo: 'deb http://download.rethinkdb.com/apt {{ansible_lsb.codename|lower}} main' + state: present become: true - apt: update_cache=yes become: true From 19522aff85bb3ffc092106d37b034fe439752086 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 19 Mar 2019 16:37:13 -0700 Subject: [PATCH 02/31] adjusting ansible config for xenial untested because of vagrant problems --- ansible/hosts-vagrant | 4 +- .../roles/brozzler-dashboard/tasks/main.yml | 8 ++-- .../templates/brozzler-dashboard.conf.j2 | 3 +- ansible/roles/brozzler-worker/tasks/main.yml | 39 +++++++++---------- .../templates/brozzler-worker.conf.j2 | 3 +- .../templates/vnc-websock.conf.j2 | 3 +- ansible/roles/common/tasks/main.yml | 10 ++--- ansible/roles/pywb/tasks/main.yml | 12 +++--- ansible/roles/pywb/templates/pywb.conf.j2 | 4 +- ansible/roles/warcprox/tasks/main.yml | 11 +++--- .../roles/warcprox/templates/warcprox.conf.j2 | 5 +-- 11 files changed, 47 insertions(+), 55 deletions(-) diff --git a/ansible/hosts-vagrant b/ansible/hosts-vagrant index b5a6604..7b27a61 100644 --- a/ansible/hosts-vagrant +++ b/ansible/hosts-vagrant @@ -1,7 +1,9 @@ [all:vars] warcs_dir=/vagrant/warcs -brozzler_pip_name='-e /brozzler' +# brozzler_pip_name='-e /brozzler' # not working anymore? :( +brozzler_pip_name='/brozzler' user=vagrant +ansible_python_interpreter=/usr/bin/python3 ### possible values for a prod deployment # brozzler_pip_name=brozzler # get it from pypi # brozzler_pip_name=git+https://github.com/internetarchive/brozzler.git#egg=brozzler diff --git a/ansible/roles/brozzler-dashboard/tasks/main.yml b/ansible/roles/brozzler-dashboard/tasks/main.yml index 42a7551..b341d1a 100644 --- a/ansible/roles/brozzler-dashboard/tasks/main.yml +++ b/ansible/roles/brozzler-dashboard/tasks/main.yml @@ -1,12 +1,12 @@ --- -- name: mkdir {{venv_root}}/brozzler-dashboard-ve34 - file: path={{venv_root}}/brozzler-dashboard-ve34 state=directory +- name: mkdir {{venv_root}}/brozzler-dashboard-ve3 + file: path={{venv_root}}/brozzler-dashboard-ve3 state=directory owner={{user}} become: true - name: install brozzler[dashboard] in virtualenv pip: name='{{brozzler_pip_name}}[dashboard]' - virtualenv={{venv_root}}/brozzler-dashboard-ve34 - virtualenv_python=python3.4 + virtualenv={{venv_root}}/brozzler-dashboard-ve3 + virtualenv_python=python3 extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache' become: true become_user: '{{user}}' diff --git a/ansible/roles/brozzler-dashboard/templates/brozzler-dashboard.conf.j2 b/ansible/roles/brozzler-dashboard/templates/brozzler-dashboard.conf.j2 index cd8e8e0..7a8f0bb 100644 --- a/ansible/roles/brozzler-dashboard/templates/brozzler-dashboard.conf.j2 +++ b/ansible/roles/brozzler-dashboard/templates/brozzler-dashboard.conf.j2 @@ -3,8 +3,7 @@ description "brozzler-dashboard" start on runlevel [2345] stop on runlevel [!2345] -env PYTHONPATH={{venv_root}}/brozzler-dashboard-ve34/lib/python3.4/site-packages -env PATH={{venv_root}}/brozzler-dashboard-ve34/bin:/usr/bin:/bin +env PATH={{venv_root}}/brozzler-dashboard-ve3/bin:/usr/bin:/bin env LC_ALL=C.UTF-8 env WAYBACK_BASEURL=http://{{groups['pywb'][0]}}:8880/brozzler diff --git a/ansible/roles/brozzler-worker/tasks/main.yml b/ansible/roles/brozzler-worker/tasks/main.yml index deb7a92..bd9512a 100644 --- a/ansible/roles/brozzler-worker/tasks/main.yml +++ b/ansible/roles/brozzler-worker/tasks/main.yml @@ -9,8 +9,14 @@ become: true apt: name={{item}} state=present with_items: - - vnc4server - chromium-browser + - vnc4server + - libjpeg-turbo8-dev + - zlib1g-dev + - gcc + - python3-dev + - python3-dbg + - adobe-flashplugin - xfonts-base - fonts-arphic-bkai00mp - fonts-arphic-bsmi00lp @@ -24,28 +30,21 @@ - fonts-sil-padauk - fonts-unfonts-extra - fonts-unfonts-core - - ttf-indic-fonts + - fonts-indic - fonts-thai-tlwg - fonts-lklug-sinhala - - git - - libjpeg-turbo8-dev - - zlib1g-dev - - gcc - - g++ - - libpython3.4-dev - - adobe-flashplugin - name: install Xvnc upstart config /etc/init/Xvnc.conf template: src=templates/Xvnc.conf.j2 dest=/etc/init/Xvnc.conf become: true notify: - restart Xvnc -- name: mkdir {{venv_root}}/websockify-ve34 +- name: mkdir {{venv_root}}/websockify-ve3 become: true - file: path={{venv_root}}/websockify-ve34 state=directory owner={{user}} + file: path={{venv_root}}/websockify-ve3 state=directory owner={{user}} - name: install websockify in virtualenv pip: name=git+https://github.com/kanaka/websockify.git#egg=websockify - virtualenv={{venv_root}}/websockify-ve34 - virtualenv_python=python3.4 + virtualenv={{venv_root}}/websockify-ve3 + virtualenv_python=python3 extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache' become: true become_user: '{{user}}' @@ -54,15 +53,15 @@ become: true notify: - restart vnc-websock -- name: mkdir {{venv_root}}/brozzler-ve34 +- name: mkdir {{venv_root}}/brozzler-ve3 become: true - file: path={{venv_root}}/brozzler-ve34 state=directory owner={{user}} + file: path={{venv_root}}/brozzler-ve3 state=directory owner={{user}} - name: install brozzler in virtualenv - pip: # name=git+https://github.com/internetarchive/brozzler.git#egg=brozzler - name='{{brozzler_pip_name}}' - virtualenv={{venv_root}}/brozzler-ve34 - virtualenv_python=python3.4 - extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache' + pip: + name: '{{brozzler_pip_name}}' + virtualenv: '{{venv_root}}/brozzler-ve3' + virtualenv_python: python3 + extra_args: '--no-input --upgrade --pre --cache-dir=/tmp/pip-cache' become: true become_user: '{{user}}' notify: diff --git a/ansible/roles/brozzler-worker/templates/brozzler-worker.conf.j2 b/ansible/roles/brozzler-worker/templates/brozzler-worker.conf.j2 index 3fd73d6..5b9f711 100644 --- a/ansible/roles/brozzler-worker/templates/brozzler-worker.conf.j2 +++ b/ansible/roles/brozzler-worker/templates/brozzler-worker.conf.j2 @@ -4,8 +4,7 @@ start on runlevel [2345] stop on runlevel [!2345] env DISPLAY=:1 -env PATH={{venv_root}}/brozzler-ve34/bin:/usr/bin:/bin -env PYTHONPATH={{venv_root}}/brozzler-ve34/lib/python3.4/site-packages +env PATH={{venv_root}}/brozzler-ve3/bin:/usr/bin:/bin env LANG=C.UTF-8 setuid {{user}} diff --git a/ansible/roles/brozzler-worker/templates/vnc-websock.conf.j2 b/ansible/roles/brozzler-worker/templates/vnc-websock.conf.j2 index 2468bae..a26345d 100644 --- a/ansible/roles/brozzler-worker/templates/vnc-websock.conf.j2 +++ b/ansible/roles/brozzler-worker/templates/vnc-websock.conf.j2 @@ -7,8 +7,7 @@ setuid {{user}} console log -env PYTHONPATH={{venv_root}}/websockify-ve34/lib/python3.4/site-packages -env PATH={{venv_root}}/websockify-ve34/bin:/usr/bin:/bin +env PATH={{venv_root}}/websockify-ve3/bin:/usr/bin:/bin # port 8901 is hard-coded in brozzler/dashboard/static/partials/workers.html exec nice websockify 0.0.0.0:8901 localhost:5901 diff --git a/ansible/roles/common/tasks/main.yml b/ansible/roles/common/tasks/main.yml index 5942b86..2167ba1 100644 --- a/ansible/roles/common/tasks/main.yml +++ b/ansible/roles/common/tasks/main.yml @@ -18,22 +18,22 @@ # this clause is a workaround for travis-ci, which only wants to install in /usr # see https://travis-ci.org/internetarchive/brozzler/builds/174338601 -# but it complains that /usr/lib/python3.4/site-packages doesn't exist +# but it complains that /usr/lib/python3.5/site-packages doesn't exist # see https://travis-ci.org/internetarchive/brozzler/builds/174094831 - file: path={{item}} state=directory with_items: - - /usr/lib/python3.4/site-packages - - /usr/lib/python3.4/dist-packages + - /usr/lib/python3.5/site-packages + - /usr/lib/python3.5/dist-packages become: true - name: run "python3 setup.py install" in /tmp/pip-9.0.1 command: python3 setup.py install chdir=/tmp/pip-9.0.1 - creates=/usr/local/lib/python3.4/dist-packages/pip-9.0.1-py3.4.egg/pip/__init__.py + creates=/usr/local/lib/python3.5/dist-packages/pip-9.0.1-py3.5.egg/pip/__init__.py become: true - name: run "pip install virtualenv" command: pip install virtualenv - creates=/usr/local/lib/python3.4/dist-packages/virtualenv.py + creates=/usr/local/lib/python3.5/dist-packages/virtualenv.py become: true - command: id {{user}} register: id_user diff --git a/ansible/roles/pywb/tasks/main.yml b/ansible/roles/pywb/tasks/main.yml index 16b9ea7..7ffe49c 100644 --- a/ansible/roles/pywb/tasks/main.yml +++ b/ansible/roles/pywb/tasks/main.yml @@ -1,13 +1,13 @@ --- -- name: mkdir {{venv_root}}/pywb-ve34 - file: path={{venv_root}}/pywb-ve34 state=directory +- name: mkdir {{venv_root}}/pywb-ve3 + file: path={{venv_root}}/pywb-ve3 state=directory owner={{user}} become: true - name: install pywb in virtualenv pip: name=pywb version=0.33.2 - virtualenv={{venv_root}}/pywb-ve34 - virtualenv_python=python3.4 + virtualenv={{venv_root}}/pywb-ve3 + virtualenv_python=python3 extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache' become: true become_user: '{{user}}' @@ -15,8 +15,8 @@ - restart pywb - name: install brozzler in pywb virtualenv pip: name='{{brozzler_pip_name}}' - virtualenv={{venv_root}}/pywb-ve34 - virtualenv_python=python3.4 + virtualenv={{venv_root}}/pywb-ve3 + virtualenv_python=python3 extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache' become: true become_user: '{{user}}' diff --git a/ansible/roles/pywb/templates/pywb.conf.j2 b/ansible/roles/pywb/templates/pywb.conf.j2 index c2cc89e..6b3450c 100644 --- a/ansible/roles/pywb/templates/pywb.conf.j2 +++ b/ansible/roles/pywb/templates/pywb.conf.j2 @@ -3,12 +3,10 @@ description "pywb" start on runlevel [2345] stop on runlevel [!2345] -env PYTHONPATH={{venv_root}}/pywb-ve34/lib/python3.4/site-packages -env PATH={{venv_root}}/pywb-ve34/bin:/usr/bin:/bin env PYWB_CONFIG_FILE=/etc/pywb.yml setuid {{user}} console log -exec nice brozzler-wayback +exec nice {{venv_root}}/pywb-ve3/bin/python {{venv_root}}/pywb-ve3/bin/brozzler-wayback diff --git a/ansible/roles/warcprox/tasks/main.yml b/ansible/roles/warcprox/tasks/main.yml index be53dc5..74e8c59 100644 --- a/ansible/roles/warcprox/tasks/main.yml +++ b/ansible/roles/warcprox/tasks/main.yml @@ -4,19 +4,18 @@ apt: name={{item}} state=present with_items: - gcc - - python3.4 - - libpython3.4-dev + - python3-dev - libffi-dev - libssl-dev - tor - git -- name: mkdir {{venv_root}}/warcprox-ve34 +- name: mkdir {{venv_root}}/warcprox-ve3 become: true - file: path={{venv_root}}/warcprox-ve34 state=directory owner={{user}} + file: path={{venv_root}}/warcprox-ve3 state=directory owner={{user}} - name: install warcprox in virtualenv pip: name=git+https://github.com/internetarchive/warcprox.git#egg=warcprox - virtualenv={{venv_root}}/warcprox-ve34 - virtualenv_python=python3.4 + virtualenv={{venv_root}}/warcprox-ve3 + virtualenv_python=python3 extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache' become: true become_user: '{{user}}' diff --git a/ansible/roles/warcprox/templates/warcprox.conf.j2 b/ansible/roles/warcprox/templates/warcprox.conf.j2 index 4d9a33d..61f36ba 100644 --- a/ansible/roles/warcprox/templates/warcprox.conf.j2 +++ b/ansible/roles/warcprox/templates/warcprox.conf.j2 @@ -3,9 +3,6 @@ description "warcprox" start on runlevel [2345] stop on runlevel [!2345] -env PYTHONPATH={{venv_root}}/warcprox-ve34/lib/python3.4/site-packages -env PATH={{venv_root}}/warcprox-ve34/bin:/usr/bin:/bin - # by default warcprox creates some files/dirs relative to cwd chdir {{work_dir}} setuid {{user}} @@ -13,7 +10,7 @@ setuid {{user}} console log # --profile -exec nice warcprox \ +exec nice {{venv_root}}/warcprox-ve3/bin/python {{venv_root}}/warcprox-ve3/bin/warcprox \ --address=0.0.0.0 \ --dir={{warcs_dir}} \ --base32 \ From 18b4a26db6ad79a2970aaf726be9f9145903d1ca Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 22 Mar 2019 23:50:46 -0700 Subject: [PATCH 03/31] porting ansible config to xenial no more upstart, switch to daemontools, among other things --- .travis.yml | 6 +- .../brozzler-dashboard/handlers/main.yml | 6 +- .../roles/brozzler-dashboard/tasks/main.yml | 27 +++++-- .../templates/brozzler-dashboard-run.j2 | 13 +++ .../templates/brozzler-dashboard.conf.j2 | 17 ---- .../roles/brozzler-worker/handlers/main.yml | 21 +++-- ansible/roles/brozzler-worker/tasks/main.yml | 57 ++++++++++--- .../brozzler-worker/templates/Xvnc-run.j2 | 14 ++++ .../brozzler-worker/templates/Xvnc.conf.j2 | 14 ---- .../templates/brozzler-worker-run.j2 | 17 ++++ .../templates/brozzler-worker.conf.j2 | 24 ------ .../templates/vnc-websock-run.j2 | 10 +++ .../templates/vnc-websock.conf.j2 | 14 ---- ansible/roles/common/tasks/main.yml | 80 +++++++++++++------ ansible/roles/pywb/handlers/main.yml | 6 +- ansible/roles/pywb/tasks/main.yml | 40 +++++++--- ansible/roles/pywb/templates/pywb-run.j2 | 10 +++ ansible/roles/pywb/templates/pywb.conf.j2 | 12 --- ansible/roles/warcprox/handlers/main.yml | 5 +- ansible/roles/warcprox/tasks/main.yml | 24 ++++-- .../templates/{warcprox.conf.j2 => run.j2} | 21 ++--- vagrant/README.rst | 10 +-- vagrant/Vagrantfile | 4 +- vagrant/run-tests.sh | 4 +- vagrant/vagrant-brozzler-new-job.py | 7 +- vagrant/vagrant-brozzler-new-site.py | 8 +- 26 files changed, 289 insertions(+), 182 deletions(-) create mode 100644 ansible/roles/brozzler-dashboard/templates/brozzler-dashboard-run.j2 delete mode 100644 ansible/roles/brozzler-dashboard/templates/brozzler-dashboard.conf.j2 create mode 100644 ansible/roles/brozzler-worker/templates/Xvnc-run.j2 delete mode 100644 ansible/roles/brozzler-worker/templates/Xvnc.conf.j2 create mode 100644 ansible/roles/brozzler-worker/templates/brozzler-worker-run.j2 delete mode 100644 ansible/roles/brozzler-worker/templates/brozzler-worker.conf.j2 create mode 100644 ansible/roles/brozzler-worker/templates/vnc-websock-run.j2 delete mode 100644 ansible/roles/brozzler-worker/templates/vnc-websock.conf.j2 create mode 100644 ansible/roles/pywb/templates/pywb-run.j2 delete mode 100644 ansible/roles/pywb/templates/pywb.conf.j2 rename ansible/roles/warcprox/templates/{warcprox.conf.j2 => run.j2} (58%) diff --git a/.travis.yml b/.travis.yml index 318ad3b..301b377 100644 --- a/.travis.yml +++ b/.travis.yml @@ -21,9 +21,9 @@ script: - DISPLAY=:1 py.test --tb=native -v tests after_failure: - chromium-browser --version -- sudo cat /var/log/upstart/warcprox.log -- sudo cat /var/log/upstart/brozzler-worker.log -- sudo cat /var/log/upstart/pywb.log +- sudo cat /var/log/warcprox.log +- sudo cat /var/log/brozzler-worker.log +- sudo cat /var/log/pywb.log notifications: slack: secure: KPPXSscXnmSEQ2NXBZFKrzDEYHg067Kv1WR7RTRUH8EIlSS9MHTyErRa7HkaRPmqOllj4vvPbplNU2ALnCfhP4cqW+MvF0xv3GuEGXQ7Om2sBvVUQ3w0JJ5rLq9ferAfGdSnQFeViqfDix5LA3fMNZGouUHQdUHq7iO8E9n9jntvkKO9Jff7Dyo0K5KvOZOJfM9KsqFZLlFO5zoNB6Y9jubIT7+Ulk3EDto/Kny34VPIyJIm7y0cHHlYLEq780AweY0EIwMyMg/VPSRrVAsbLSrilO0YRgsQpjPC9Ci/rAWNWooaOk0eA+bwv1uHQnGtH0z446XUMXr3UZ2QlD4DE/uoP2okkl8EtqvlmEyjV8eO86TqYFDRgKfYpvlK6hHtb7SAHX28QeXQjbKNc5f7KpKO5PtZqaoBRL7acLlKyS8xQGiRtonTPFSBTFR2A+s6dZmKO9dDboglptiHk4dvL1ZD4S8qLJn1JjTJqvIU6tpCY3BpNErn4n1MkDjN5nqdXf7Q9Vmui8vRetwnMf1oXcsKj9FEt2utNfDqFNXcFsN+Mnr9rhXQ1++gt/7Zo844OowiARcxqZTNy5LqSD01WgGCvNMy3Odf+FTQ8PcDOF+001+g8La1R99U0o9/hT/gy+WYk2prYneWru4pQHF/a6goZgkLTwkskcaPVpDJtDs= diff --git a/ansible/roles/brozzler-dashboard/handlers/main.yml b/ansible/roles/brozzler-dashboard/handlers/main.yml index 610c470..39c6b75 100644 --- a/ansible/roles/brozzler-dashboard/handlers/main.yml +++ b/ansible/roles/brozzler-dashboard/handlers/main.yml @@ -1,4 +1,8 @@ --- - name: restart brozzler-dashboard - service: name=brozzler-dashboard state=restarted + svc: + name: brozzler-dashboard + state: restarted + service_dir: /etc/service become: true + diff --git a/ansible/roles/brozzler-dashboard/tasks/main.yml b/ansible/roles/brozzler-dashboard/tasks/main.yml index b341d1a..db09af7 100644 --- a/ansible/roles/brozzler-dashboard/tasks/main.yml +++ b/ansible/roles/brozzler-dashboard/tasks/main.yml @@ -3,18 +3,31 @@ file: path={{venv_root}}/brozzler-dashboard-ve3 state=directory owner={{user}} become: true + - name: install brozzler[dashboard] in virtualenv - pip: name='{{brozzler_pip_name}}[dashboard]' - virtualenv={{venv_root}}/brozzler-dashboard-ve3 - virtualenv_python=python3 - extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache' + pip: + name: '{{brozzler_pip_name}}[dashboard]' + virtualenv: '{{venv_root}}/brozzler-dashboard-ve3' + virtualenv_python: python3 + virtualenv_command: python3 /usr/lib/python3/dist-packages/virtualenv.py + extra_args: '--no-input --upgrade --pre --cache-dir=/tmp/pip-cache' become: true become_user: '{{user}}' notify: - restart brozzler-dashboard -- name: install upstart config /etc/init/brozzler-dashboard.conf + +- name: mkdir /etc/service/brozzler-dashboard + file: + path: /etc/service/brozzler-dashboard + state: directory become: true - template: src=templates/brozzler-dashboard.conf.j2 - dest=/etc/init/brozzler-dashboard.conf + +- name: install /etc/service/brozzler-dashboard/run + template: + src: templates/brozzler-dashboard-run.j2 + dest: /etc/service/brozzler-dashboard/run + mode: 0755 notify: - restart brozzler-dashboard + become: true + diff --git a/ansible/roles/brozzler-dashboard/templates/brozzler-dashboard-run.j2 b/ansible/roles/brozzler-dashboard/templates/brozzler-dashboard-run.j2 new file mode 100644 index 0000000..45fe737 --- /dev/null +++ b/ansible/roles/brozzler-dashboard/templates/brozzler-dashboard-run.j2 @@ -0,0 +1,13 @@ +#!/bin/bash + +logfile=/var/log/brozzler-dashboard.log +touch $logfile +chown {{user}} $logfile + +exec nice setuidgid {{user}} \ + env WAYBACK_BASEURL=http://{{groups['pywb'][0]}}:8880/brozzler \ + RETHINKDB_SERVERS={{groups['rethinkdb'] | join(',')}} \ + RETHINKDB_DB=brozzler LANG=en_US.UTF-8 LC_COLLATE=C \ + gunicorn --bind=0.0.0.0:8881 brozzler.dashboard:app \ + >> $logfile 2>&1 + diff --git a/ansible/roles/brozzler-dashboard/templates/brozzler-dashboard.conf.j2 b/ansible/roles/brozzler-dashboard/templates/brozzler-dashboard.conf.j2 deleted file mode 100644 index 7a8f0bb..0000000 --- a/ansible/roles/brozzler-dashboard/templates/brozzler-dashboard.conf.j2 +++ /dev/null @@ -1,17 +0,0 @@ -description "brozzler-dashboard" - -start on runlevel [2345] -stop on runlevel [!2345] - -env PATH={{venv_root}}/brozzler-dashboard-ve3/bin:/usr/bin:/bin -env LC_ALL=C.UTF-8 - -env WAYBACK_BASEURL=http://{{groups['pywb'][0]}}:8880/brozzler -env RETHINKDB_SERVERS={{groups['rethinkdb'] | join(',')}} -env RETHINKDB_DB=brozzler - -setuid {{user}} - -console log - -exec gunicorn --bind=0.0.0.0:8881 brozzler.dashboard:app diff --git a/ansible/roles/brozzler-worker/handlers/main.yml b/ansible/roles/brozzler-worker/handlers/main.yml index 1fac304..e139240 100644 --- a/ansible/roles/brozzler-worker/handlers/main.yml +++ b/ansible/roles/brozzler-worker/handlers/main.yml @@ -1,13 +1,22 @@ --- - name: restart Xvnc - service: name=Xvnc state=restarted - become: true -- name: restart websockify - service: name=websockify state=restarted + svc: + name: Xvnc + state: restarted + service_dir: /etc/service become: true + - name: restart vnc-websock - service: name=vnc-websock state=restarted + svc: + name: vnc-websock + state: restarted + service_dir: /etc/service become: true + - name: restart brozzler-worker - service: name=brozzler-worker state=restarted + svc: + name: brozzler-worker + state: restarted + service_dir: /etc/service become: true + diff --git a/ansible/roles/brozzler-worker/tasks/main.yml b/ansible/roles/brozzler-worker/tasks/main.yml index bd9512a..ebf5d2d 100644 --- a/ansible/roles/brozzler-worker/tasks/main.yml +++ b/ansible/roles/brozzler-worker/tasks/main.yml @@ -3,8 +3,10 @@ apt_repository: repo='deb http://archive.canonical.com/ubuntu trusty partner' state=present become: true + - apt: update_cache=yes become: true + - name: ensure required packages are installed become: true apt: name={{item}} state=present @@ -33,41 +35,72 @@ - fonts-indic - fonts-thai-tlwg - fonts-lklug-sinhala -- name: install Xvnc upstart config /etc/init/Xvnc.conf - template: src=templates/Xvnc.conf.j2 dest=/etc/init/Xvnc.conf + +- name: mkdir /etc/service/warcprox + file: + path: '/etc/service/{{item}}' + state: directory + with_items: + - Xvnc + - websockify + - vnc-websock + - brozzler-worker become: true + +- name: install /etc/service/Xvnc/run + template: + src: templates/Xvnc-run.j2 + dest: /etc/service/Xvnc/run + mode: 0755 notify: - restart Xvnc + become: true + - name: mkdir {{venv_root}}/websockify-ve3 become: true file: path={{venv_root}}/websockify-ve3 state=directory owner={{user}} + - name: install websockify in virtualenv - pip: name=git+https://github.com/kanaka/websockify.git#egg=websockify - virtualenv={{venv_root}}/websockify-ve3 - virtualenv_python=python3 - extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache' + pip: + name: git+https://github.com/kanaka/websockify.git#egg=websockify + virtualenv: '{{venv_root}}/websockify-ve3' + virtualenv_python: python3 + virtualenv_command: python3 /usr/lib/python3/dist-packages/virtualenv.py + extra_args: '--no-input --upgrade --pre --cache-dir=/tmp/pip-cache' become: true become_user: '{{user}}' -- name: install vnc-websock upstart config /etc/init/vnc-websock.conf - template: src=templates/vnc-websock.conf.j2 dest=/etc/init/vnc-websock.conf - become: true + +- name: install /etc/service/vnc-websock/run + template: + src: templates/vnc-websock-run.j2 + dest: /etc/service/vnc-websock/run + mode: 0755 notify: - restart vnc-websock + become: true + - name: mkdir {{venv_root}}/brozzler-ve3 become: true file: path={{venv_root}}/brozzler-ve3 state=directory owner={{user}} + - name: install brozzler in virtualenv pip: name: '{{brozzler_pip_name}}' virtualenv: '{{venv_root}}/brozzler-ve3' virtualenv_python: python3 + virtualenv_command: python3 /usr/lib/python3/dist-packages/virtualenv.py extra_args: '--no-input --upgrade --pre --cache-dir=/tmp/pip-cache' become: true become_user: '{{user}}' notify: - restart brozzler-worker -- name: install brozzler-worker upstart config /etc/init/brozzler-worker.conf - template: src=templates/brozzler-worker.conf.j2 dest=/etc/init/brozzler-worker.conf - become: true + +- name: install /etc/service/brozzler-worker/run + template: + src: templates/brozzler-worker-run.j2 + dest: /etc/service/brozzler-worker/run + mode: 0755 notify: - restart brozzler-worker + become: true + diff --git a/ansible/roles/brozzler-worker/templates/Xvnc-run.j2 b/ansible/roles/brozzler-worker/templates/Xvnc-run.j2 new file mode 100644 index 0000000..e8d573d --- /dev/null +++ b/ansible/roles/brozzler-worker/templates/Xvnc-run.j2 @@ -0,0 +1,14 @@ +#!/bin/bash + +cd /tmp + +logfile=/var/log/Xvnc.log +touch $logfile +chown {{user}} $logfile + +exec nice setuidgid {{user}} Xvnc4 :1 -auth /tmp/Xauthority.{{user}} \ + -geometry 1600x1000 -depth 24 -rfbwait 0 -nolisten tcp -rfbport 5901 \ + -SecurityTypes None -pn -fp /usr/share/fonts/X11/misc/ -co /etc/X11/rgb \ + AcceptCutText=0 AcceptPointerEvents=0 AcceptKeyEvents=0 \ + >> $logfile 2>&1 + diff --git a/ansible/roles/brozzler-worker/templates/Xvnc.conf.j2 b/ansible/roles/brozzler-worker/templates/Xvnc.conf.j2 deleted file mode 100644 index 57ece99..0000000 --- a/ansible/roles/brozzler-worker/templates/Xvnc.conf.j2 +++ /dev/null @@ -1,14 +0,0 @@ -description "Xvnc" - -start on runlevel [2345] -stop on runlevel [!2345] - -setuid {{user}} - -console log - -exec nice Xvnc4 :1 -auth /tmp/Xauthority.{{user}} \ - -geometry 1600x1000 -depth 24 -rfbwait 0 -nolisten tcp -rfbport 5901 \ - -SecurityTypes None -pn -fp /usr/share/fonts/X11/misc/ -co /etc/X11/rgb \ - AcceptCutText=0 AcceptPointerEvents=0 AcceptKeyEvents=0 - diff --git a/ansible/roles/brozzler-worker/templates/brozzler-worker-run.j2 b/ansible/roles/brozzler-worker/templates/brozzler-worker-run.j2 new file mode 100644 index 0000000..9889ef7 --- /dev/null +++ b/ansible/roles/brozzler-worker/templates/brozzler-worker-run.j2 @@ -0,0 +1,17 @@ +#!/bin/bash + +logfile=/var/log/brozzler-worker.log +touch $logfile +chown {{user}} $logfile + +source {{venv_root}}/brozzler-ve3/bin/activate + +exec nice setuidgid {{user}} \ + env LANG=en_US.UTF-8 LC_COLLATE=C \ + brozzler-worker \ + --rethinkdb-servers={{groups['rethinkdb'] | join(',')}} \ + --max-browsers=4 \ + --verbose \ + --warcprox-auto \ + >> $logfile 2>&1 + diff --git a/ansible/roles/brozzler-worker/templates/brozzler-worker.conf.j2 b/ansible/roles/brozzler-worker/templates/brozzler-worker.conf.j2 deleted file mode 100644 index 5b9f711..0000000 --- a/ansible/roles/brozzler-worker/templates/brozzler-worker.conf.j2 +++ /dev/null @@ -1,24 +0,0 @@ -description "brozzler-worker" - -start on runlevel [2345] -stop on runlevel [!2345] - -env DISPLAY=:1 -env PATH={{venv_root}}/brozzler-ve3/bin:/usr/bin:/bin -env LANG=C.UTF-8 - -setuid {{user}} - -console log - -# depends on vnc server -start on started Xvnc -stop on stopping Xvnc - -kill timeout 60 - -exec nice brozzler-worker \ - --rethinkdb-servers={{groups['rethinkdb'] | join(',')}} \ - --max-browsers=4 \ - --verbose \ - --warcprox-auto diff --git a/ansible/roles/brozzler-worker/templates/vnc-websock-run.j2 b/ansible/roles/brozzler-worker/templates/vnc-websock-run.j2 new file mode 100644 index 0000000..522c125 --- /dev/null +++ b/ansible/roles/brozzler-worker/templates/vnc-websock-run.j2 @@ -0,0 +1,10 @@ +#!/bin/bash + +logfile=/var/log/vnc-websock.log +touch $logfile +chown {{user}} $logfile + +source /opt/websockify-ve3/bin/activate + +exec nice setuidgid {{user}} websockify 0.0.0.0:8901 localhost:5901 >> $logfile 2>&1 + diff --git a/ansible/roles/brozzler-worker/templates/vnc-websock.conf.j2 b/ansible/roles/brozzler-worker/templates/vnc-websock.conf.j2 deleted file mode 100644 index a26345d..0000000 --- a/ansible/roles/brozzler-worker/templates/vnc-websock.conf.j2 +++ /dev/null @@ -1,14 +0,0 @@ -description "vnc-websock" - -start on runlevel [2345] -stop on runlevel [!2345] - -setuid {{user}} - -console log - -env PATH={{venv_root}}/websockify-ve3/bin:/usr/bin:/bin - -# port 8901 is hard-coded in brozzler/dashboard/static/partials/workers.html -exec nice websockify 0.0.0.0:8901 localhost:5901 - diff --git a/ansible/roles/common/tasks/main.yml b/ansible/roles/common/tasks/main.yml index 2167ba1..6ff90a2 100644 --- a/ansible/roles/common/tasks/main.yml +++ b/ansible/roles/common/tasks/main.yml @@ -1,44 +1,74 @@ --- -# get latest pip (had problems with version from apt-get, specifically -# "pip install pyopenssl" did not install the dependency "cryptography") -# http://stackoverflow.com/questions/34587473/what-is-get-pip-py-checksum-where-can-i-get-it-for-sure -- name: install setuptools for python 2 and 3 +- apt: + name: + - python3-setuptools + - python3-pip + - python3-virtualenv + - daemontools + - daemontools-run + state: present + update_cache: yes + cache_valid_time: 86400 # one day become: true - apt: name={{item}} state=present - with_items: - - python-setuptools - - python3-setuptools -- name: download pip-9.0.1.tar.gz - get_url: - url: https://pypi.python.org/packages/11/b6/abcb525026a4be042b486df43905d6893fb04f05aac21c32c638e939e447/pip-9.0.1.tar.gz - dest: /tmp - checksum: sha1:57ff41e99cb01b6a1c2b0999161589b726f0ec8b -- name: extract pip-9.0.1.tar.gz - unarchive: src=/tmp/pip-9.0.1.tar.gz dest=/tmp copy=no + +# # get recent virtualenv, which bundles a recent pip +# - find: +# paths: +# - /usr/local/lib/python3.4/dist-packages +# - /usr/local/lib/python3.5/dist-packages +# recurse: true +# patterns: virtualenv.py +# contains: '__version__ = "16.4.3"' +# register: virtualenv_py_16_4_3 +# +# - command: mktemp -d +# register: mktempd_out +# when: virtualenv_py_16_4_3.matched == 0 +# +# - name: download virtualenv-16.4.3 +# get_url: +# url: https://files.pythonhosted.org/packages/37/db/89d6b043b22052109da35416abc3c397655e4bd3cff031446ba02b9654fa/virtualenv-16.4.3.tar.gz +# dest: '{{mktempd_out.stdout}}' +# checksum: sha256:984d7e607b0a5d1329425dd8845bd971b957424b5ba664729fab51ab8c11bc39 +# when: virtualenv_py_16_4_3.matched == 0 +# +# - name: extract virtualenv-16.4.3.tar.gz +# unarchive: +# src: '{{mktempd_out.stdout}}/virtualenv-16.4.3.tar.gz' +# dest: '{{mktempd_out.stdout}}' +# copy: no +# when: virtualenv_py_16_4_3.matched == 0 +# +# - name: run "python3 setup.py install" in {{mktempd_out.stdout}}/virtualenv-16.4.3 +# become: true +# command: python3 setup.py install +# args: +# chdir: '{{mktempd_out.stdout}}/virtualenv-16.4.3' +# when: virtualenv_py_16_4_3.matched == 0 +# +# - file: +# path: '{{mktempd_out.stdout}}' +# state: absent +# become: true +# when: virtualenv_py_16_4_3.matched == 0 # this clause is a workaround for travis-ci, which only wants to install in /usr # see https://travis-ci.org/internetarchive/brozzler/builds/174338601 # but it complains that /usr/lib/python3.5/site-packages doesn't exist # see https://travis-ci.org/internetarchive/brozzler/builds/174094831 -- file: path={{item}} state=directory +- file: + path: '{{item}}' + state: directory with_items: - /usr/lib/python3.5/site-packages - /usr/lib/python3.5/dist-packages become: true -- name: run "python3 setup.py install" in /tmp/pip-9.0.1 - command: python3 setup.py install - chdir=/tmp/pip-9.0.1 - creates=/usr/local/lib/python3.5/dist-packages/pip-9.0.1-py3.5.egg/pip/__init__.py - become: true -- name: run "pip install virtualenv" - command: pip install virtualenv - creates=/usr/local/lib/python3.5/dist-packages/virtualenv.py - become: true - command: id {{user}} register: id_user ignore_errors: true changed_when: false + - name: ensure service user {{user}} exists user: name={{user}} system=yes createhome=no home=/nonexistent shell=/usr/sbin/nologin diff --git a/ansible/roles/pywb/handlers/main.yml b/ansible/roles/pywb/handlers/main.yml index 4424b3e..744d30a 100644 --- a/ansible/roles/pywb/handlers/main.yml +++ b/ansible/roles/pywb/handlers/main.yml @@ -1,5 +1,9 @@ --- - name: restart pywb - service: name=pywb state=restarted + svc: + name: pywb + state: restarted + service_dir: /etc/service become: true + diff --git a/ansible/roles/pywb/tasks/main.yml b/ansible/roles/pywb/tasks/main.yml index 7ffe49c..532ad4c 100644 --- a/ansible/roles/pywb/tasks/main.yml +++ b/ansible/roles/pywb/tasks/main.yml @@ -3,34 +3,50 @@ file: path={{venv_root}}/pywb-ve3 state=directory owner={{user}} become: true + - name: install pywb in virtualenv - pip: name=pywb - version=0.33.2 - virtualenv={{venv_root}}/pywb-ve3 - virtualenv_python=python3 - extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache' + pip: + name: pywb + version: 0.33.2 + virtualenv: '{{venv_root}}/pywb-ve3' + virtualenv_python: python3 + virtualenv_command: python3 /usr/lib/python3/dist-packages/virtualenv.py + extra_args: '--no-input --upgrade --pre --cache-dir=/tmp/pip-cache' become: true become_user: '{{user}}' notify: - restart pywb + - name: install brozzler in pywb virtualenv - pip: name='{{brozzler_pip_name}}' - virtualenv={{venv_root}}/pywb-ve3 - virtualenv_python=python3 - extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache' + pip: + name: '{{brozzler_pip_name}}' + virtualenv: '{{venv_root}}/pywb-ve3' + virtualenv_python: python3 + virtualenv_command: python3 /usr/lib/python3/dist-packages/virtualenv.py + extra_args: '--no-input --upgrade --pre --cache-dir=/tmp/pip-cache' become: true become_user: '{{user}}' notify: - restart pywb + - name: pywb config file /etc/pywb.yml template: src=templates/pywb.yml.j2 dest=/etc/pywb.yml become: true notify: - restart pywb -- name: upstart config file /etc/init/pywb.conf - template: src=templates/pywb.conf.j2 - dest=/etc/init/pywb.conf + +- name: mkdir /etc/service/pywb + file: + path: /etc/service/pywb + state: directory become: true + +- name: install /etc/service/pywb/run + template: + src: templates/pywb-run.j2 + dest: /etc/service/pywb/run + mode: 0755 notify: - restart pywb + become: true diff --git a/ansible/roles/pywb/templates/pywb-run.j2 b/ansible/roles/pywb/templates/pywb-run.j2 new file mode 100644 index 0000000..26a40f9 --- /dev/null +++ b/ansible/roles/pywb/templates/pywb-run.j2 @@ -0,0 +1,10 @@ +#!/bin/bash + +logfile=/var/log/pywb.log +touch $logfile +chown {{user}} $logfile + +exec nice setuidgid {{user}} env PYWB_CONFIG_FILE=/etc/pywb.yml \ + {{venv_root}}/pywb-ve3/bin/python {{venv_root}}/pywb-ve3/bin/brozzler-wayback \ + >> $logfile 2>&1 + diff --git a/ansible/roles/pywb/templates/pywb.conf.j2 b/ansible/roles/pywb/templates/pywb.conf.j2 deleted file mode 100644 index 6b3450c..0000000 --- a/ansible/roles/pywb/templates/pywb.conf.j2 +++ /dev/null @@ -1,12 +0,0 @@ -description "pywb" - -start on runlevel [2345] -stop on runlevel [!2345] - -env PYWB_CONFIG_FILE=/etc/pywb.yml - -setuid {{user}} - -console log - -exec nice {{venv_root}}/pywb-ve3/bin/python {{venv_root}}/pywb-ve3/bin/brozzler-wayback diff --git a/ansible/roles/warcprox/handlers/main.yml b/ansible/roles/warcprox/handlers/main.yml index 0b7edcd..8fbb3c9 100644 --- a/ansible/roles/warcprox/handlers/main.yml +++ b/ansible/roles/warcprox/handlers/main.yml @@ -1,4 +1,7 @@ --- - name: restart warcprox - service: name=warcprox state=restarted + svc: + name: warcprox + state: restarted + service_dir: /etc/service become: true diff --git a/ansible/roles/warcprox/tasks/main.yml b/ansible/roles/warcprox/tasks/main.yml index 74e8c59..8027606 100644 --- a/ansible/roles/warcprox/tasks/main.yml +++ b/ansible/roles/warcprox/tasks/main.yml @@ -13,16 +13,28 @@ become: true file: path={{venv_root}}/warcprox-ve3 state=directory owner={{user}} - name: install warcprox in virtualenv - pip: name=git+https://github.com/internetarchive/warcprox.git#egg=warcprox - virtualenv={{venv_root}}/warcprox-ve3 - virtualenv_python=python3 - extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache' + pip: + name: git+https://github.com/internetarchive/warcprox.git#egg=warcprox + virtualenv: '{{venv_root}}/warcprox-ve3' + virtualenv_python: python3 + extra_args: --no-input --upgrade --pre --cache-dir=/tmp/pip-cache + virtualenv_command: python3 /usr/lib/python3/dist-packages/virtualenv.py become: true become_user: '{{user}}' notify: - restart warcprox -- name: install upstart config /etc/init/warcprox.conf + +- name: mkdir /etc/service/warcprox + file: + path: /etc/service/warcprox + state: directory become: true - template: src=templates/warcprox.conf.j2 dest=/etc/init/warcprox.conf + +- name: install /etc/service/warcprox/run + template: + src: templates/run.j2 + dest: /etc/service/warcprox/run + mode: 0755 notify: - restart warcprox + become: true diff --git a/ansible/roles/warcprox/templates/warcprox.conf.j2 b/ansible/roles/warcprox/templates/run.j2 similarity index 58% rename from ansible/roles/warcprox/templates/warcprox.conf.j2 rename to ansible/roles/warcprox/templates/run.j2 index 61f36ba..30cd173 100644 --- a/ansible/roles/warcprox/templates/warcprox.conf.j2 +++ b/ansible/roles/warcprox/templates/run.j2 @@ -1,16 +1,15 @@ -description "warcprox" +#!/bin/bash -start on runlevel [2345] -stop on runlevel [!2345] -# by default warcprox creates some files/dirs relative to cwd -chdir {{work_dir}} -setuid {{user}} +logfile=/var/log/warcprox.log +touch $logfile +chown {{user}} $logfile -console log +ulimit -n 4096 -# --profile -exec nice {{venv_root}}/warcprox-ve3/bin/python {{venv_root}}/warcprox-ve3/bin/warcprox \ +source {{venv_root}}/warcprox-ve3/bin/activate + +exec nice -n5 setuidgid {{user}} env LANG=en_US.UTF-8 LC_COLLATE=C warcprox \ --address=0.0.0.0 \ --dir={{warcs_dir}} \ --base32 \ @@ -19,4 +18,6 @@ exec nice {{venv_root}}/warcprox-ve3/bin/python {{venv_root}}/warcprox-ve3/bin/w --onion-tor-socks-proxy=localhost:9050 \ --rethinkdb-services-url=rethinkdb://{{groups['rethinkdb']|join(',')}}/brozzler/services \ --rethinkdb-stats-url=rethinkdb://{{groups['rethinkdb']|join(',')}}/brozzler/stats \ - --rethinkdb-big-table-url=rethinkdb://{{groups['rethinkdb']|join(',')}}/brozzler/captures + --rethinkdb-big-table-url=rethinkdb://{{groups['rethinkdb']|join(',')}}/brozzler/captures \ + >> $logfile 2>&1 + diff --git a/vagrant/README.rst b/vagrant/README.rst index fdb96bc..8b54e59 100644 --- a/vagrant/README.rst +++ b/vagrant/README.rst @@ -24,27 +24,27 @@ the brozzler virtualenv. :: my-laptop$ vagrant ssh - vagrant@brzl:~$ source /opt/brozzler-ve34/bin/activate - (brozzler-ve34)vagrant@brzl:~$ + vagrant@brzl:~$ source /opt/brozzler-ve3/bin/activate + (brozzler-ve3)vagrant@brzl:~$ Then you can run brozzler-new-site: :: - (brozzler-ve34)vagrant@brzl:~$ brozzler-new-site --proxy=localhost:8000 http://example.com/ + (brozzler-ve3)vagrant@brzl:~$ brozzler-new-site --proxy=localhost:8000 http://example.com/ Or brozzler-new-job (make sure to set the proxy to localhost:8000): :: - (brozzler-ve34)vagrant@brzl:~$ cat >job1.yml <job1.yml <=2.1b1.dev86"' -vagrant ssh -- "source /opt/brozzler-ve34/bin/activate && DISPLAY=:1 py.test -v /brozzler/tests $@" +vagrant ssh -- 'set -x ; source /opt/brozzler-ve3/bin/activate && pip install pytest && pip install --upgrade --pre "warcprox>=2.1b1.dev86"' +vagrant ssh -- "source /opt/brozzler-ve3/bin/activate && DISPLAY=:1 py.test -v /brozzler/tests $@" diff --git a/vagrant/vagrant-brozzler-new-job.py b/vagrant/vagrant-brozzler-new-job.py index 454e45a..c75d075 100755 --- a/vagrant/vagrant-brozzler-new-job.py +++ b/vagrant/vagrant-brozzler-new-job.py @@ -7,7 +7,7 @@ This is a standalone script with no dependencies other than python, and should work with python 2.7 or python 3.2+. The only reason it's not a bash script is so we can use the argparse library. -Copyright (C) 2016 Internet Archive +Copyright (C) 2016-2019 Internet Archive Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -41,9 +41,8 @@ def main(argv=[]): subprocess.call([ 'vagrant', 'ssh', '--', 'f=`mktemp` && cat > $f && ' - 'PYTHONPATH=/home/vagrant/brozzler-ve34/lib/python3.4/site-packages ' - '/home/vagrant/brozzler-ve34/bin/python ' - '/home/vagrant/brozzler-ve34/bin/brozzler-new-job $f'], + '/home/vagrant/brozzler-ve3/bin/python ' + '/home/vagrant/brozzler-ve3/bin/brozzler-new-job $f'], stdin=f) if __name__ == '__main__': diff --git a/vagrant/vagrant-brozzler-new-site.py b/vagrant/vagrant-brozzler-new-site.py index 99401c5..158095e 100755 --- a/vagrant/vagrant-brozzler-new-site.py +++ b/vagrant/vagrant-brozzler-new-site.py @@ -74,11 +74,9 @@ def main(argv=[]): os.chdir(os.path.dirname(__file__)) cmd = ( - 'PYTHONPATH=/home/vagrant/brozzler-ve34/lib/python3.4/site-packages ' - '/home/vagrant/brozzler-ve34/bin/python ' - '/home/vagrant/brozzler-ve34/bin/brozzler-new-site ' - '--proxy=localhost:8000 %s %s') % ( - ' '.join(options), args.seed) + '/home/vagrant/brozzler-ve3/bin/python ' + '/home/vagrant/brozzler-ve3/bin/brozzler-new-site ' + '--proxy=localhost:8000 %s %s') % (' '.join(options), args.seed) subprocess.call(['vagrant', 'ssh', '--', cmd]) if __name__ == '__main__': From 48bb03418ffaedddbc28066b6b734558a261cc33 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Sat, 23 Mar 2019 00:26:39 -0700 Subject: [PATCH 04/31] daemontools --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 301b377..f6cda55 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,7 +16,7 @@ install: - sudo apt-get update - sudo apt-get install --only-upgrade chromium-browser - chromium-browser --version -- sudo service brozzler-worker restart +- sudo svc -t /etc/service/brozzler-worker script: - DISPLAY=:1 py.test --tb=native -v tests after_failure: From 9c658cddf7693d19d733daa5e483d96c270f966b Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Sun, 24 Mar 2019 16:06:36 -0700 Subject: [PATCH 05/31] fix a couple of svc definitions --- .../brozzler-dashboard/templates/brozzler-dashboard-run.j2 | 2 ++ ansible/roles/warcprox/templates/run.j2 | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/ansible/roles/brozzler-dashboard/templates/brozzler-dashboard-run.j2 b/ansible/roles/brozzler-dashboard/templates/brozzler-dashboard-run.j2 index 45fe737..da40564 100644 --- a/ansible/roles/brozzler-dashboard/templates/brozzler-dashboard-run.j2 +++ b/ansible/roles/brozzler-dashboard/templates/brozzler-dashboard-run.j2 @@ -4,6 +4,8 @@ logfile=/var/log/brozzler-dashboard.log touch $logfile chown {{user}} $logfile +source /opt/brozzler-dashboard-ve3/bin/activate + exec nice setuidgid {{user}} \ env WAYBACK_BASEURL=http://{{groups['pywb'][0]}}:8880/brozzler \ RETHINKDB_SERVERS={{groups['rethinkdb'] | join(',')}} \ diff --git a/ansible/roles/warcprox/templates/run.j2 b/ansible/roles/warcprox/templates/run.j2 index 30cd173..161c79f 100644 --- a/ansible/roles/warcprox/templates/run.j2 +++ b/ansible/roles/warcprox/templates/run.j2 @@ -1,12 +1,13 @@ #!/bin/bash - logfile=/var/log/warcprox.log touch $logfile chown {{user}} $logfile ulimit -n 4096 +cd {{work_dir}} + source {{venv_root}}/warcprox-ve3/bin/activate exec nice -n5 setuidgid {{user}} env LANG=en_US.UTF-8 LC_COLLATE=C warcprox \ From 8b6e5cbfb936feb92793d0c43f789a75a0fc6b5b Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 2 Apr 2019 17:58:13 +0000 Subject: [PATCH 06/31] new option brozzler-purge --finished-before=... --- brozzler/cli.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/brozzler/cli.py b/brozzler/cli.py index 188d591..458e447 100644 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -2,7 +2,7 @@ ''' brozzler/cli.py - brozzler command line executables -Copyright (C) 2014-2017 Internet Archive +Copyright (C) 2014-2019 Internet Archive Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -605,6 +605,10 @@ def brozzler_purge(argv=None): '--site', dest='site', metavar='SITE_ID', help=( 'purge crawl state from rethinkdb for a site, including all ' 'pages')) + group.add_argument( + '--finished-before', dest='finished_before', metavar='YYYY-MM-DD', + help=('purge crawl state from rethinkdb for a jobs that ended ' + 'before this date')) arg_parser.add_argument( '--force', dest='force', action='store_true', help=( 'purge even if job or site is still has status ACTIVE')) @@ -653,6 +657,20 @@ def brozzler_purge(argv=None): '(override with --force)', site_id) sys.exit(1) _purge_site(rr, site_id) + elif args.finished_before: + finished_before = datetime.datetime.strptime( + args.finished_before, '%Y-%m-%d').replace( + tzinfo=doublethink.UTC) + reql = rr.table('jobs').filter( + r.row['finished'].default(r.maxval).lt(finished_before).or_( + r.row['starts_and_stops'].nth(-1)['stop'].default(r.maxval).lt(finished_before))) + logging.debug( + 'retrieving jobs older than %s: %s', finished_before, reql) + for job in reql.run(): + # logging.info('job %s finished=%s starts_and_stops[-1]["stop"]=%s', + # job['id'], job.get('finished'), + # job.get('starts_and_stops', [{'stop':None}])[-1]['stop']) + _purge_job(rr, job['id']) def _purge_site(rr, site_id): reql = rr.table('pages').between( From 06e072a7168ce7cc9697d2a783b7211177a40e48 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 2 Apr 2019 17:58:35 +0000 Subject: [PATCH 07/31] update some dependencies --- setup.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 9bbfd3b..c9a240c 100644 --- a/setup.py +++ b/setup.py @@ -64,10 +64,10 @@ setuptools.setup( ], }, install_requires=[ - 'PyYAML>=3.12', + 'PyYAML>=5.8', 'youtube-dl>=2018.7.21', 'reppy==0.3.4', - 'requests>=2.18.4', + 'requests>=2.21', 'websocket-client>=0.39.0,<=0.48.0', 'pillow>=5.2.0', 'urlcanon>=0.1.dev23', @@ -80,13 +80,13 @@ setuptools.setup( ], extras_require={ 'dashboard': [ - 'flask>=0.11', + 'flask>=1.0', 'gunicorn>=19.8.1' ], 'easy': [ 'warcprox>=2.4b2.dev173', 'pywb>=0.33.2,<2', - 'flask>=0.11', + 'flask>=1.0', 'gunicorn>=19.8.1' ], }, From 85c6ac0ab208e91c8ed573626428a8695e759747 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 2 Apr 2019 12:05:08 -0700 Subject: [PATCH 08/31] fix next travis-ci problem --- ansible/roles/brozzler-worker/tasks/main.yml | 3 +-- tests/test_cluster.py | 4 ++-- vagrant/run-tests.sh | 10 +++++----- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/ansible/roles/brozzler-worker/tasks/main.yml b/ansible/roles/brozzler-worker/tasks/main.yml index ebf5d2d..4945052 100644 --- a/ansible/roles/brozzler-worker/tasks/main.yml +++ b/ansible/roles/brozzler-worker/tasks/main.yml @@ -36,13 +36,12 @@ - fonts-thai-tlwg - fonts-lklug-sinhala -- name: mkdir /etc/service/warcprox +- name: mkdir /etc/service/{Xvnc,vnc-websock,brozzler-worker} file: path: '/etc/service/{{item}}' state: directory with_items: - Xvnc - - websockify - vnc-websock - brozzler-worker become: true diff --git a/tests/test_cluster.py b/tests/test_cluster.py index f5007a1..13b3c66 100644 --- a/tests/test_cluster.py +++ b/tests/test_cluster.py @@ -35,10 +35,10 @@ import logging import warcprox def start_service(service): - subprocess.check_call(['sudo', 'service', service, 'start']) + subprocess.check_call(['sudo', 'svc', '-u', '/etc/service/' + service]) def stop_service(service): - subprocess.check_call(['sudo', 'service', service, 'stop']) + subprocess.check_call(['sudo', 'svc', '-d', '/etc/service/' + service]) @pytest.fixture(scope='module') def httpd(request): diff --git a/vagrant/run-tests.sh b/vagrant/run-tests.sh index 6ef8022..515b37b 100755 --- a/vagrant/run-tests.sh +++ b/vagrant/run-tests.sh @@ -10,11 +10,11 @@ cd $(dirname "${BASH_SOURCE[0]}") vagrant up echo service status: -vagrant ssh -- 'status warcprox ; - status Xvnc ; - status brozzler-worker ; - status brozzler-dashboard ; - status vnc-websock' +vagrant ssh -- 'sudo svcstat /etc/service/warcprox ; + sudo svcstat /etc/service/Xvnc ; + sudo svcstat /etc/service/brozzler-worker ; + sudo svcstat /etc/service/brozzler-dashboard ; + sudo svcstat /etc/service/vnc-websock' echo vagrant ssh -- 'set -x ; source /opt/brozzler-ve3/bin/activate && pip install pytest && pip install --upgrade --pre "warcprox>=2.1b1.dev86"' From 68ce9eac762d375b1e73b1a744b3d595e2684042 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 2 Apr 2019 13:05:36 -0700 Subject: [PATCH 09/31] debugging travis-ci is a slow process --- .travis.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index f6cda55..9f5b6b6 100644 --- a/.travis.yml +++ b/.travis.yml @@ -17,8 +17,11 @@ install: - sudo apt-get install --only-upgrade chromium-browser - chromium-browser --version - sudo svc -t /etc/service/brozzler-worker +- sleep 10 +- sudo cat /var/log/brozzler-worker.log +- sudo cat /var/log/warcprox.log script: -- DISPLAY=:1 py.test --tb=native -v tests +- DISPLAY=:1 py.test --tb=native -x -v tests after_failure: - chromium-browser --version - sudo cat /var/log/warcprox.log From 9459ed40d02e2c8ce23487feec745d6f2f9a8288 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 4 Apr 2019 12:38:41 -0700 Subject: [PATCH 10/31] fix typo --- vagrant/run-tests.sh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vagrant/run-tests.sh b/vagrant/run-tests.sh index 515b37b..b1ff00b 100755 --- a/vagrant/run-tests.sh +++ b/vagrant/run-tests.sh @@ -10,11 +10,11 @@ cd $(dirname "${BASH_SOURCE[0]}") vagrant up echo service status: -vagrant ssh -- 'sudo svcstat /etc/service/warcprox ; - sudo svcstat /etc/service/Xvnc ; - sudo svcstat /etc/service/brozzler-worker ; - sudo svcstat /etc/service/brozzler-dashboard ; - sudo svcstat /etc/service/vnc-websock' +vagrant ssh -- 'sudo svstat /etc/service/warcprox ; + sudo svstat /etc/service/Xvnc ; + sudo svstat /etc/service/brozzler-worker ; + sudo svstat /etc/service/brozzler-dashboard ; + sudo svstat /etc/service/vnc-websock' echo vagrant ssh -- 'set -x ; source /opt/brozzler-ve3/bin/activate && pip install pytest && pip install --upgrade --pre "warcprox>=2.1b1.dev86"' From 899794f2dafb55ac451c061feb3953d0cfb6e6e4 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 4 Apr 2019 12:38:46 -0700 Subject: [PATCH 11/31] debug what's going on with chromium in travis see https://travis-ci.org/internetarchive/brozzler/jobs/514858838 (unroll "sudo cat /var/log/brozzler-worker.log") 2019-04-02 20:16:01,792 18595 CRITICAL BrozzlingThread:42073 brozzler.worker.BrozzlerWorker.brozzle_site(worker.py:412) unexpected exception Traceback (most recent call last): File "/opt/brozzler-ve3/lib/python3.6/site-packages/brozzler/worker.py", line 379, in brozzle_site enable_youtube_dl=not self._skip_youtube_dl) File "/opt/brozzler-ve3/lib/python3.6/site-packages/brozzler/worker.py", line 215, in brozzle_page browser, site, page, on_screenshot, on_request) File "/opt/brozzler-ve3/lib/python3.6/site-packages/brozzler/worker.py", line 292, in _browse_page cookie_db=site.get('cookie_db')) File "/opt/brozzler-ve3/lib/python3.6/site-packages/brozzler/browser.py", line 341, in start self.websock_url = self.chrome.start(**kwargs) File "/opt/brozzler-ve3/lib/python3.6/site-packages/brozzler/chrome.py", line 200, in start return self._websocket_url() File "/opt/brozzler-ve3/lib/python3.6/site-packages/brozzler/chrome.py", line 247, in _websocket_url raise e Exception: chrome process died with status 1 --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 9f5b6b6..1f5dcd4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,6 +16,7 @@ install: - sudo apt-get update - sudo apt-get install --only-upgrade chromium-browser - chromium-browser --version +- timeout 20 chromium-browser - sudo svc -t /etc/service/brozzler-worker - sleep 10 - sudo cat /var/log/brozzler-worker.log From 8303fd3ab357ccb4ff10aed3e1a65e067e452482 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 4 Apr 2019 12:50:50 -0700 Subject: [PATCH 12/31] guessing DISPLAY was the issue here https://travis-ci.org/internetarchive/brozzler/jobs/515882174#L610 --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 1f5dcd4..952e7b2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,7 +16,7 @@ install: - sudo apt-get update - sudo apt-get install --only-upgrade chromium-browser - chromium-browser --version -- timeout 20 chromium-browser +- DISPLAY=:1 timeout 20 chromium-browser - sudo svc -t /etc/service/brozzler-worker - sleep 10 - sudo cat /var/log/brozzler-worker.log From 45ac12117a0b5fc0e9da010ada106dd5425edb5e Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 4 Apr 2019 13:09:02 -0700 Subject: [PATCH 13/31] maybe Xvnc.log will tell us something --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 952e7b2..bc5b0e2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,6 +16,7 @@ install: - sudo apt-get update - sudo apt-get install --only-upgrade chromium-browser - chromium-browser --version +- sudo cat /var/log/Xvnc.log - DISPLAY=:1 timeout 20 chromium-browser - sudo svc -t /etc/service/brozzler-worker - sleep 10 From 0d46d8ce19a1fd2e5fe658ace4c9471edd7f98f3 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 4 Apr 2019 13:15:17 -0700 Subject: [PATCH 14/31] still trying to figure out what's up with chromium --- .travis.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index bc5b0e2..8ec77f1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,8 +16,9 @@ install: - sudo apt-get update - sudo apt-get install --only-upgrade chromium-browser - chromium-browser --version +- ps -fHe - sudo cat /var/log/Xvnc.log -- DISPLAY=:1 timeout 20 chromium-browser +- DISPLAY=:1 time timeout 20 chromium-browser - sudo svc -t /etc/service/brozzler-worker - sleep 10 - sudo cat /var/log/brozzler-worker.log From 6d145c87c8062c3cbb41bbfc6dc3275fd9251852 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 4 Apr 2019 13:24:12 -0700 Subject: [PATCH 15/31] chromium-browser --disable-extensions ? --- .travis.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 8ec77f1..de279fb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,9 +16,10 @@ install: - sudo apt-get update - sudo apt-get install --only-upgrade chromium-browser - chromium-browser --version -- ps -fHe +- ps ww -fHe - sudo cat /var/log/Xvnc.log -- DISPLAY=:1 time timeout 20 chromium-browser +- time DISPLAY=:1 timeout 20 chromium-browser --disable-extensions +- time DISPLAY=:1 timeout 20 chromium-browser - sudo svc -t /etc/service/brozzler-worker - sleep 10 - sudo cat /var/log/brozzler-worker.log From 473e891fb4ee448fb19b0537feda38d21bbf4233 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 4 Apr 2019 13:34:45 -0700 Subject: [PATCH 16/31] not sure if --disable-extensions did something --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index de279fb..45bbeb3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -18,8 +18,8 @@ install: - chromium-browser --version - ps ww -fHe - sudo cat /var/log/Xvnc.log -- time DISPLAY=:1 timeout 20 chromium-browser --disable-extensions -- time DISPLAY=:1 timeout 20 chromium-browser +- time DISPLAY=:1 timeout 20 chromium-browser || true +- time DISPLAY=:1 timeout 20 chromium-browser --disable-extensions || true - sudo svc -t /etc/service/brozzler-worker - sleep 10 - sudo cat /var/log/brozzler-worker.log From 58d1d1c42989c61dc0accd927910c3c2b2e8d90c Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 4 Apr 2019 14:38:29 -0700 Subject: [PATCH 17/31] chromium-browser with no args isn't dying at start what about with all the args? --- .travis.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 45bbeb3..252f711 100644 --- a/.travis.yml +++ b/.travis.yml @@ -19,7 +19,8 @@ install: - ps ww -fHe - sudo cat /var/log/Xvnc.log - time DISPLAY=:1 timeout 20 chromium-browser || true -- time DISPLAY=:1 timeout 20 chromium-browser --disable-extensions || true +- mkdir -vp /tmp/chium +- time HOME=/tmp/chium DISPLAY=:1 timeout 20 chromium-browser --remote-debugging-port=9222 --use-mock-keychain --user-data-dir=/tmp/chium/chrome-user-data --disable-background-networking --disable-renderer-backgrounding --disable-hang-monitor --disable-background-timer-throttling --mute-audio --disable-web-sockets --window-size=1100,900 --no-default-browser-check --disable-first-run-ui --no-first-run --homepage=about:blank --disable-direct-npapi-requests --disable-web-security --disable-notifications --disable-extensions --disable-save-password-bubble --ignore-certificate-errors --proxy-server=localhost:8000 about:blank || true - sudo svc -t /etc/service/brozzler-worker - sleep 10 - sudo cat /var/log/brozzler-worker.log From 55541be9e9cd4dde7e192b55e6c1e932bb63baec Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 4 Apr 2019 15:11:24 -0700 Subject: [PATCH 18/31] let's see chromium output inside brozzler-worker using --trace, because chromium seems to be working ok when we just run it --- ansible/roles/brozzler-worker/templates/brozzler-worker-run.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/brozzler-worker/templates/brozzler-worker-run.j2 b/ansible/roles/brozzler-worker/templates/brozzler-worker-run.j2 index 9889ef7..87a1c8f 100644 --- a/ansible/roles/brozzler-worker/templates/brozzler-worker-run.j2 +++ b/ansible/roles/brozzler-worker/templates/brozzler-worker-run.j2 @@ -11,7 +11,7 @@ exec nice setuidgid {{user}} \ brozzler-worker \ --rethinkdb-servers={{groups['rethinkdb'] | join(',')}} \ --max-browsers=4 \ - --verbose \ + --trace \ --warcprox-auto \ >> $logfile 2>&1 From fd0fe811e9b06481347ca46581839849de41cd8e Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 4 Apr 2019 16:09:21 -0700 Subject: [PATCH 19/31] so little output from chromium-browser :( https://travis-ci.org/internetarchive/brozzler/jobs/515942434 could it be problems running as this other user? --- .travis.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.travis.yml b/.travis.yml index 252f711..0336fdd 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,6 +11,7 @@ before_install: - sudo pip install ansible==2.1.3.0 install: - ansible-playbook --extra-vars="brozzler_pip_name=file://$TRAVIS_BUILD_DIR#egg=brozzler user=travis" --inventory-file=ansible/hosts-localhost ansible/playbook.yml +- head -999 /etc/service/*/run - pip install $TRAVIS_BUILD_DIR git+https://github.com/internetarchive/warcprox.git#egg=warcprox pytest - chromium-browser --version - sudo apt-get update @@ -19,6 +20,7 @@ install: - ps ww -fHe - sudo cat /var/log/Xvnc.log - time DISPLAY=:1 timeout 20 chromium-browser || true +- time DISPLAY=:1 sudo -u brozzler timeout 20 chromium-browser || true - mkdir -vp /tmp/chium - time HOME=/tmp/chium DISPLAY=:1 timeout 20 chromium-browser --remote-debugging-port=9222 --use-mock-keychain --user-data-dir=/tmp/chium/chrome-user-data --disable-background-networking --disable-renderer-backgrounding --disable-hang-monitor --disable-background-timer-throttling --mute-audio --disable-web-sockets --window-size=1100,900 --no-default-browser-check --disable-first-run-ui --no-first-run --homepage=about:blank --disable-direct-npapi-requests --disable-web-security --disable-notifications --disable-extensions --disable-save-password-bubble --ignore-certificate-errors --proxy-server=localhost:8000 about:blank || true - sudo svc -t /etc/service/brozzler-worker From dfd9d9ecdd494af544bab2b0608d3d71ec2d145e Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 4 Apr 2019 17:22:15 -0700 Subject: [PATCH 20/31] omfg --- ansible/roles/brozzler-worker/templates/brozzler-worker-run.j2 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/brozzler-worker/templates/brozzler-worker-run.j2 b/ansible/roles/brozzler-worker/templates/brozzler-worker-run.j2 index 87a1c8f..855411f 100644 --- a/ansible/roles/brozzler-worker/templates/brozzler-worker-run.j2 +++ b/ansible/roles/brozzler-worker/templates/brozzler-worker-run.j2 @@ -7,7 +7,7 @@ chown {{user}} $logfile source {{venv_root}}/brozzler-ve3/bin/activate exec nice setuidgid {{user}} \ - env LANG=en_US.UTF-8 LC_COLLATE=C \ + env DISPLAY=:1 LANG=en_US.UTF-8 LC_COLLATE=C \ brozzler-worker \ --rethinkdb-servers={{groups['rethinkdb'] | join(',')}} \ --max-browsers=4 \ From 433b201b5284b8ad7ff1ce2b887e864f0a28d4f7 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 9 Apr 2019 01:43:38 -0700 Subject: [PATCH 21/31] use logging.warning() to quiet py37 warnings --- brozzler/__init__.py | 2 +- brozzler/chrome.py | 8 ++++---- brozzler/cli.py | 6 +++--- brozzler/easy.py | 2 +- brozzler/frontier.py | 8 ++++---- brozzler/robots.py | 2 +- brozzler/worker.py | 8 ++++---- brozzler/ydl.py | 4 ++-- tests/test_cluster.py | 2 +- 9 files changed, 21 insertions(+), 21 deletions(-) diff --git a/brozzler/__init__.py b/brozzler/__init__.py index ccfaacd..c97835f 100644 --- a/brozzler/__init__.py +++ b/brozzler/__init__.py @@ -159,7 +159,7 @@ class ThreadExceptionGate: def queue_exception(self, e): with self.lock: if self.pending_exception: - self.logger.warn( + self.logger.warning( '%r already pending for thread %r, discarding %r', self.pending_exception, self.thread, e) else: diff --git a/brozzler/chrome.py b/brozzler/chrome.py index 5928586..c70296f 100644 --- a/brozzler/chrome.py +++ b/brozzler/chrome.py @@ -223,7 +223,7 @@ class Chrome: raise except Exception as e: if time.time() - self._last_warning > 30: - self.logger.warn( + self.logger.warning( 'problem with %s (will keep trying until timeout ' 'of %d seconds): %s', json_url, timeout_sec, e) self._last_warning = time.time() @@ -294,7 +294,7 @@ class Chrome: 'chrome pid %s exited normally', self.chrome_process.pid) else: - self.logger.warn( + self.logger.warning( 'chrome pid %s exited with nonzero status %s', self.chrome_process.pid, status) @@ -305,13 +305,13 @@ class Chrome: return time.sleep(0.5) - self.logger.warn( + self.logger.warning( 'chrome pid %s still alive %.1f seconds after sending ' 'SIGTERM, sending SIGKILL', self.chrome_process.pid, time.time() - t0) os.killpg(self.chrome_process.pid, signal.SIGKILL) status = self.chrome_process.wait() - self.logger.warn( + self.logger.warning( 'chrome pid %s reaped (status=%s) after killing with ' 'SIGKILL', self.chrome_process.pid, status) diff --git a/brozzler/cli.py b/brozzler/cli.py index 188d591..4b0bd76 100644 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -627,7 +627,7 @@ def brozzler_purge(argv=None): sys.exit(1) if job.status == 'ACTIVE': if args.force: - logging.warn( + logging.warning( 'job %s has status ACTIVE, purging anyway because ' '--force was supplied', job_id) else: @@ -644,7 +644,7 @@ def brozzler_purge(argv=None): sys.exit(1) if site.status == 'ACTIVE': if args.force: - logging.warn( + logging.warning( 'site %s has status ACTIVE, purging anyway because ' '--force was supplied', site_id) else: @@ -712,7 +712,7 @@ def brozzler_list_captures(argv=None): if args.url_or_sha1[:5] == 'sha1:': if args.prefix: - logging.warn( + logging.warning( 'ignoring supplied --prefix option which does not apply ' 'to lookup by sha1') # assumes it's already base32 (XXX could detect if hex and convert) diff --git a/brozzler/easy.py b/brozzler/easy.py index 83cf1ba..dd98884 100644 --- a/brozzler/easy.py +++ b/brozzler/easy.py @@ -260,7 +260,7 @@ class BrozzlerEasyController: state_strs.append(str(th)) stack = traceback.format_stack(sys._current_frames()[th.ident]) state_strs.append(''.join(stack)) - logging.warn('dumping state (caught signal {})\n{}'.format( + logging.warning('dumping state (caught signal {})\n{}'.format( signum, '\n'.join(state_strs))) def main(argv=None): diff --git a/brozzler/frontier.py b/brozzler/frontier.py index 3826abf..0e3b777 100644 --- a/brozzler/frontier.py +++ b/brozzler/frontier.py @@ -138,7 +138,7 @@ class RethinkDbFrontier: sites = [] for i in range(result["replaced"]): if result["changes"][i]["old_val"]["claimed"]: - self.logger.warn( + self.logger.warning( "re-claimed site that was still marked 'claimed' " "because it was last claimed a long time ago " "at %s, and presumably some error stopped it from " @@ -225,7 +225,7 @@ class RethinkDbFrontier: if not job: return False if job.status.startswith("FINISH"): - self.logger.warn("%s is already %s", job, job.status) + self.logger.warning("%s is already %s", job, job.status) return True results = self.rr.table("sites").get_all(job_id, index="job_id").run() @@ -415,7 +415,7 @@ class RethinkDbFrontier: assert isinstance(e, brozzler.ReachedLimit) if (site.reached_limit and site.reached_limit != e.warcprox_meta["reached-limit"]): - self.logger.warn( + self.logger.warning( "reached limit %s but site had already reached limit %s", e.warcprox_meta["reached-limit"], self.reached_limit) else: @@ -434,7 +434,7 @@ class RethinkDbFrontier: index="priority_by_site").filter({"hops_from_seed":0}).run() pages = list(results) if len(pages) > 1: - self.logger.warn( + self.logger.warning( "more than one seed page for site_id %s ?", site_id) if len(pages) < 1: return None diff --git a/brozzler/robots.py b/brozzler/robots.py index 5b96423..4122093 100644 --- a/brozzler/robots.py +++ b/brozzler/robots.py @@ -106,7 +106,7 @@ def is_permitted_by_robots(site, url, proxy=None): # reppy has wrapped an exception that we want to bubble up raise brozzler.ProxyError(e) else: - logging.warn( + logging.warning( "returning true (permitted) after problem fetching " "robots.txt for %r: %r", url, e) return True diff --git a/brozzler/worker.py b/brozzler/worker.py index fba83aa..5ce5499 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -147,13 +147,13 @@ class BrozzlerWorker: try: with urllib.request.urlopen(request, timeout=600) as response: if response.getcode() != 204: - self.logger.warn( + self.logger.warning( 'got "%s %s" response on warcprox ' 'WARCPROX_WRITE_RECORD request (expected 204)', response.getcode(), response.reason) return request, response except urllib.error.HTTPError as e: - self.logger.warn( + self.logger.warning( 'got "%s %s" response on warcprox ' 'WARCPROX_WRITE_RECORD request (expected 204)', e.getcode(), e.info()) @@ -370,7 +370,7 @@ class BrozzlerWorker: if (page.needs_robots_check and not brozzler.is_permitted_by_robots( site, page.url, self._proxy_for(site))): - logging.warn("page %s is blocked by robots.txt", page.url) + logging.warning("page %s is blocked by robots.txt", page.url) page.blocked_by_robots = True self._frontier.completed_page(site, page) else: @@ -544,7 +544,7 @@ class BrozzlerWorker: def start(self): with self._start_stop_lock: if self._thread: - self.logger.warn( + self.logger.warning( 'ignoring start request because self._thread is ' 'not None') return diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 57550e5..2388df9 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -48,7 +48,7 @@ _orig_webpage_read_content = youtube_dl.extractor.generic.GenericIE._webpage_rea def _webpage_read_content(self, *args, **kwargs): content = _orig_webpage_read_content(self, *args, **kwargs) if len(content) > 20000000: - logging.warn( + logging.warning( 'bypassing youtube-dl extraction because content is ' 'too large (%s characters)', len(content)) return '' @@ -185,7 +185,7 @@ def _build_youtube_dl(worker, destdir, site): mimetype = magic.from_file(ctx['filename'], mime=True) except ImportError as e: mimetype = 'video/%s' % info_dict['ext'] - self.logger.warn( + self.logger.warning( 'guessing mimetype %s because %r', mimetype, e) url = 'youtube-dl:%05d:%s' % ( diff --git a/tests/test_cluster.py b/tests/test_cluster.py index 13b3c66..c57abb8 100644 --- a/tests/test_cluster.py +++ b/tests/test_cluster.py @@ -684,7 +684,7 @@ def test_warcprox_outage_resiliency(httpd): try: stop_service('warcprox') except Exception as e: - logging.warn('problem stopping warcprox service: %s', e) + logging.warning('problem stopping warcprox service: %s', e) # queue the site for brozzling brozzler.new_site(frontier, site) From 8dfd92cf7f15603ea8dcdcaf8efb93c73dc5a65e Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 9 Apr 2019 01:44:14 -0700 Subject: [PATCH 22/31] fix this utility --- vagrant/vagrant-brozzler-new-site.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vagrant/vagrant-brozzler-new-site.py b/vagrant/vagrant-brozzler-new-site.py index 158095e..b0a0d80 100755 --- a/vagrant/vagrant-brozzler-new-site.py +++ b/vagrant/vagrant-brozzler-new-site.py @@ -74,9 +74,8 @@ def main(argv=[]): os.chdir(os.path.dirname(__file__)) cmd = ( - '/home/vagrant/brozzler-ve3/bin/python ' - '/home/vagrant/brozzler-ve3/bin/brozzler-new-site ' - '--proxy=localhost:8000 %s %s') % (' '.join(options), args.seed) + '/opt/brozzler-ve3/bin/python /opt/brozzler-ve3/bin/brozzler-new-site ' + '%s %s') % (' '.join(options), args.seed) subprocess.call(['vagrant', 'ssh', '--', cmd]) if __name__ == '__main__': From a2ac3a03740a991740c167887a7b116b4fa07346 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Tue, 14 May 2019 12:10:59 +0000 Subject: [PATCH 23/31] logging.warn is deprecated and replaced by logging.warning We replace it everywhere in the code base. --- brozzler/__init__.py | 2 +- brozzler/chrome.py | 8 ++++---- brozzler/cli.py | 6 +++--- brozzler/easy.py | 2 +- brozzler/frontier.py | 8 ++++---- brozzler/robots.py | 2 +- brozzler/worker.py | 8 ++++---- brozzler/ydl.py | 4 ++-- 8 files changed, 20 insertions(+), 20 deletions(-) diff --git a/brozzler/__init__.py b/brozzler/__init__.py index ccfaacd..c97835f 100644 --- a/brozzler/__init__.py +++ b/brozzler/__init__.py @@ -159,7 +159,7 @@ class ThreadExceptionGate: def queue_exception(self, e): with self.lock: if self.pending_exception: - self.logger.warn( + self.logger.warning( '%r already pending for thread %r, discarding %r', self.pending_exception, self.thread, e) else: diff --git a/brozzler/chrome.py b/brozzler/chrome.py index 5928586..c70296f 100644 --- a/brozzler/chrome.py +++ b/brozzler/chrome.py @@ -223,7 +223,7 @@ class Chrome: raise except Exception as e: if time.time() - self._last_warning > 30: - self.logger.warn( + self.logger.warning( 'problem with %s (will keep trying until timeout ' 'of %d seconds): %s', json_url, timeout_sec, e) self._last_warning = time.time() @@ -294,7 +294,7 @@ class Chrome: 'chrome pid %s exited normally', self.chrome_process.pid) else: - self.logger.warn( + self.logger.warning( 'chrome pid %s exited with nonzero status %s', self.chrome_process.pid, status) @@ -305,13 +305,13 @@ class Chrome: return time.sleep(0.5) - self.logger.warn( + self.logger.warning( 'chrome pid %s still alive %.1f seconds after sending ' 'SIGTERM, sending SIGKILL', self.chrome_process.pid, time.time() - t0) os.killpg(self.chrome_process.pid, signal.SIGKILL) status = self.chrome_process.wait() - self.logger.warn( + self.logger.warning( 'chrome pid %s reaped (status=%s) after killing with ' 'SIGKILL', self.chrome_process.pid, status) diff --git a/brozzler/cli.py b/brozzler/cli.py index 188d591..4b0bd76 100644 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -627,7 +627,7 @@ def brozzler_purge(argv=None): sys.exit(1) if job.status == 'ACTIVE': if args.force: - logging.warn( + logging.warning( 'job %s has status ACTIVE, purging anyway because ' '--force was supplied', job_id) else: @@ -644,7 +644,7 @@ def brozzler_purge(argv=None): sys.exit(1) if site.status == 'ACTIVE': if args.force: - logging.warn( + logging.warning( 'site %s has status ACTIVE, purging anyway because ' '--force was supplied', site_id) else: @@ -712,7 +712,7 @@ def brozzler_list_captures(argv=None): if args.url_or_sha1[:5] == 'sha1:': if args.prefix: - logging.warn( + logging.warning( 'ignoring supplied --prefix option which does not apply ' 'to lookup by sha1') # assumes it's already base32 (XXX could detect if hex and convert) diff --git a/brozzler/easy.py b/brozzler/easy.py index 83cf1ba..dd98884 100644 --- a/brozzler/easy.py +++ b/brozzler/easy.py @@ -260,7 +260,7 @@ class BrozzlerEasyController: state_strs.append(str(th)) stack = traceback.format_stack(sys._current_frames()[th.ident]) state_strs.append(''.join(stack)) - logging.warn('dumping state (caught signal {})\n{}'.format( + logging.warning('dumping state (caught signal {})\n{}'.format( signum, '\n'.join(state_strs))) def main(argv=None): diff --git a/brozzler/frontier.py b/brozzler/frontier.py index 3826abf..0e3b777 100644 --- a/brozzler/frontier.py +++ b/brozzler/frontier.py @@ -138,7 +138,7 @@ class RethinkDbFrontier: sites = [] for i in range(result["replaced"]): if result["changes"][i]["old_val"]["claimed"]: - self.logger.warn( + self.logger.warning( "re-claimed site that was still marked 'claimed' " "because it was last claimed a long time ago " "at %s, and presumably some error stopped it from " @@ -225,7 +225,7 @@ class RethinkDbFrontier: if not job: return False if job.status.startswith("FINISH"): - self.logger.warn("%s is already %s", job, job.status) + self.logger.warning("%s is already %s", job, job.status) return True results = self.rr.table("sites").get_all(job_id, index="job_id").run() @@ -415,7 +415,7 @@ class RethinkDbFrontier: assert isinstance(e, brozzler.ReachedLimit) if (site.reached_limit and site.reached_limit != e.warcprox_meta["reached-limit"]): - self.logger.warn( + self.logger.warning( "reached limit %s but site had already reached limit %s", e.warcprox_meta["reached-limit"], self.reached_limit) else: @@ -434,7 +434,7 @@ class RethinkDbFrontier: index="priority_by_site").filter({"hops_from_seed":0}).run() pages = list(results) if len(pages) > 1: - self.logger.warn( + self.logger.warning( "more than one seed page for site_id %s ?", site_id) if len(pages) < 1: return None diff --git a/brozzler/robots.py b/brozzler/robots.py index 5b96423..4122093 100644 --- a/brozzler/robots.py +++ b/brozzler/robots.py @@ -106,7 +106,7 @@ def is_permitted_by_robots(site, url, proxy=None): # reppy has wrapped an exception that we want to bubble up raise brozzler.ProxyError(e) else: - logging.warn( + logging.warning( "returning true (permitted) after problem fetching " "robots.txt for %r: %r", url, e) return True diff --git a/brozzler/worker.py b/brozzler/worker.py index fba83aa..5ce5499 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -147,13 +147,13 @@ class BrozzlerWorker: try: with urllib.request.urlopen(request, timeout=600) as response: if response.getcode() != 204: - self.logger.warn( + self.logger.warning( 'got "%s %s" response on warcprox ' 'WARCPROX_WRITE_RECORD request (expected 204)', response.getcode(), response.reason) return request, response except urllib.error.HTTPError as e: - self.logger.warn( + self.logger.warning( 'got "%s %s" response on warcprox ' 'WARCPROX_WRITE_RECORD request (expected 204)', e.getcode(), e.info()) @@ -370,7 +370,7 @@ class BrozzlerWorker: if (page.needs_robots_check and not brozzler.is_permitted_by_robots( site, page.url, self._proxy_for(site))): - logging.warn("page %s is blocked by robots.txt", page.url) + logging.warning("page %s is blocked by robots.txt", page.url) page.blocked_by_robots = True self._frontier.completed_page(site, page) else: @@ -544,7 +544,7 @@ class BrozzlerWorker: def start(self): with self._start_stop_lock: if self._thread: - self.logger.warn( + self.logger.warning( 'ignoring start request because self._thread is ' 'not None') return diff --git a/brozzler/ydl.py b/brozzler/ydl.py index 57550e5..2388df9 100644 --- a/brozzler/ydl.py +++ b/brozzler/ydl.py @@ -48,7 +48,7 @@ _orig_webpage_read_content = youtube_dl.extractor.generic.GenericIE._webpage_rea def _webpage_read_content(self, *args, **kwargs): content = _orig_webpage_read_content(self, *args, **kwargs) if len(content) > 20000000: - logging.warn( + logging.warning( 'bypassing youtube-dl extraction because content is ' 'too large (%s characters)', len(content)) return '' @@ -185,7 +185,7 @@ def _build_youtube_dl(worker, destdir, site): mimetype = magic.from_file(ctx['filename'], mime=True) except ImportError as e: mimetype = 'video/%s' % info_dict['ext'] - self.logger.warn( + self.logger.warning( 'guessing mimetype %s because %r', mimetype, e) url = 'youtube-dl:%05d:%s' % ( From a1f91223170ab5b36ea40b3746271102291ee7cb Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Tue, 14 May 2019 16:29:52 +0000 Subject: [PATCH 24/31] Fix test_brozzling::httpd fixture We used `self.headers.getheader` which no longer works. We replace it with `self.headers.get`. We change the code to write binary data to `self.wfile` because we get an exception for writing str and/or None. --- tests/test_brozzling.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_brozzling.py b/tests/test_brozzling.py index 0efe5a3..c648f27 100644 --- a/tests/test_brozzling.py +++ b/tests/test_brozzling.py @@ -67,8 +67,8 @@ def httpd(request): self.send_header('WWW-Authenticate', 'Basic realm=\"Test\"') self.send_header('Content-type', 'text/html') self.end_headers() - self.wfile.write(self.headers.getheader('Authorization')) - self.wfile.write('not authenticated') + self.wfile.write(self.headers.get('Authorization', b'')) + self.wfile.write(b'not authenticated') else: super().do_GET() From f8165dc02b0607268ad24f80c48cd49e8ccadf23 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 15 May 2019 18:46:21 -0700 Subject: [PATCH 25/31] work around pytest issue until fix is out https://github.com/pytest-dev/pytest/issues/5257 --- .travis.yml | 2 +- vagrant/run-tests.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 0336fdd..a2d8f2c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,7 +12,7 @@ before_install: install: - ansible-playbook --extra-vars="brozzler_pip_name=file://$TRAVIS_BUILD_DIR#egg=brozzler user=travis" --inventory-file=ansible/hosts-localhost ansible/playbook.yml - head -999 /etc/service/*/run -- pip install $TRAVIS_BUILD_DIR git+https://github.com/internetarchive/warcprox.git#egg=warcprox pytest +- pip install $TRAVIS_BUILD_DIR git+https://github.com/internetarchive/warcprox.git#egg=warcprox pytest==4.3.0 - chromium-browser --version - sudo apt-get update - sudo apt-get install --only-upgrade chromium-browser diff --git a/vagrant/run-tests.sh b/vagrant/run-tests.sh index b1ff00b..2d6c117 100755 --- a/vagrant/run-tests.sh +++ b/vagrant/run-tests.sh @@ -17,5 +17,5 @@ vagrant ssh -- 'sudo svstat /etc/service/warcprox ; sudo svstat /etc/service/vnc-websock' echo -vagrant ssh -- 'set -x ; source /opt/brozzler-ve3/bin/activate && pip install pytest && pip install --upgrade --pre "warcprox>=2.1b1.dev86"' -vagrant ssh -- "source /opt/brozzler-ve3/bin/activate && DISPLAY=:1 py.test -v /brozzler/tests $@" +vagrant ssh -- 'set -x ; source /opt/brozzler-ve3/bin/activate && pip install pytest==4.3.0 && pip install --upgrade --pre "warcprox>=2.1b1.dev86"' +vagrant ssh -- "source /opt/brozzler-ve3/bin/activate && DISPLAY=:1 py.test --tb=native -v /brozzler/tests $@" From 0a1360ab25a83f23b5c2ee4483435736e6a2f213 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 15 May 2019 18:49:18 -0700 Subject: [PATCH 26/31] don't use localhost for test http server... ... because apparently sometimes chromium bypasses the proxy for local addresses --- tests/test_cluster.py | 137 +++++++++++++++++++++++++----------------- 1 file changed, 82 insertions(+), 55 deletions(-) diff --git a/tests/test_cluster.py b/tests/test_cluster.py index c57abb8..e04624b 100644 --- a/tests/test_cluster.py +++ b/tests/test_cluster.py @@ -34,16 +34,41 @@ import http.server import logging import warcprox +# https://stackoverflow.com/questions/166506/finding-local-ip-addresses-using-pythons-stdlib +def _local_address(): + import socket + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + try: + s.connect(('10.255.255.255', 1)) # ip doesn't need to be reachable + return s.getsockname()[0] + except: + return '127.0.0.1' + finally: + s.close() + +local_address = _local_address() + def start_service(service): subprocess.check_call(['sudo', 'svc', '-u', '/etc/service/' + service]) def stop_service(service): subprocess.check_call(['sudo', 'svc', '-d', '/etc/service/' + service]) + while True: + status = subprocess.check_output( + ['sudo', 'svstat', '/etc/service/' + service]) + if b' down ' in status: + break + time.sleep(0.5) @pytest.fixture(scope='module') def httpd(request): class RequestHandler(http.server.SimpleHTTPRequestHandler): + def do_POST(self): + logging.info('\n%s\n%s', self.requestline, self.headers) + self.do_GET() + def do_GET(self): + logging.info('\n%s\n%s', self.requestline, self.headers) if self.path == '/site5/redirect/': self.send_response(303, 'See other') self.send_header('Connection', 'close') @@ -82,7 +107,7 @@ def httpd(request): # SimpleHTTPRequestHandler always uses CWD so we have to chdir os.chdir(os.path.join(os.path.dirname(__file__), 'htdocs')) - httpd = http.server.HTTPServer(('localhost', 0), RequestHandler) + httpd = http.server.HTTPServer((local_address, 0), RequestHandler) httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever) httpd_thread.start() @@ -94,6 +119,9 @@ def httpd(request): return httpd +def make_url(httpd, rel_url): + return 'http://%s:%s%s' % (local_address, httpd.server_port, rel_url) + def test_httpd(httpd): ''' Tests that our http server is working as expected, and that two fetches @@ -101,7 +129,7 @@ def test_httpd(httpd): deduplication. ''' payload1 = content2 = None - url = 'http://localhost:%s/site1/file1.txt' % httpd.server_port + url = make_url(httpd, '/site1/file1.txt') with urllib.request.urlopen(url) as response: assert response.status == 200 payload1 = response.read() @@ -140,13 +168,13 @@ def test_brozzle_site(httpd): test_id = 'test_brozzle_site-%s' % datetime.datetime.utcnow().isoformat() rr = doublethink.Rethinker('localhost', db='brozzler') site = brozzler.Site(rr, { - 'seed': 'http://localhost:%s/site1/' % httpd.server_port, + 'seed': make_url(httpd, '/site1/'), 'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}}) # the two pages we expect to be crawled - page1 = 'http://localhost:%s/site1/' % httpd.server_port - page2 = 'http://localhost:%s/site1/file1.txt' % httpd.server_port - robots = 'http://localhost:%s/robots.txt' % httpd.server_port + page1 = make_url(httpd, '/site1/') + page2 = make_url(httpd, '/site1/file1.txt') + robots = make_url(httpd, '/robots.txt') # so we can examine rethinkdb before it does anything try: @@ -171,8 +199,7 @@ def test_brozzle_site(httpd): pages = list(frontier.site_pages(site.id)) assert len(pages) == 2 assert {page.url for page in pages} == { - 'http://localhost:%s/site1/' % httpd.server_port, - 'http://localhost:%s/site1/file1.txt' % httpd.server_port} + make_url(httpd, '/site1/'), make_url(httpd, '/site1/file1.txt')} time.sleep(2) # in case warcprox hasn't finished processing urls # take a look at the captures table @@ -255,8 +282,8 @@ def test_proxy_non_warcprox(httpd): start_service('brozzler-worker') assert len(proxy.requests) <= 15 assert proxy.requests.count('GET /status') == 1 - assert ('GET http://localhost:%s/site1/' % httpd.server_port) in proxy.requests - assert ('GET http://localhost:%s/site1/file1.txt' % httpd.server_port) in proxy.requests + assert ('GET %s' % make_url(httpd, '/site1/')) in proxy.requests + assert ('GET %s' % make_url(httpd, '/site1/file1.txt')) in proxy.requests assert [req for req in proxy.requests if req.startswith('WARCPROX_WRITE_RECORD')] == [] proxy.shutdown() @@ -292,14 +319,14 @@ def _test_proxy_setting( datetime.datetime.utcnow().isoformat()) # the two pages we expect to be crawled - page1 = 'http://localhost:%s/site1/' % httpd.server_port - page2 = 'http://localhost:%s/site1/file1.txt' % httpd.server_port - robots = 'http://localhost:%s/robots.txt' % httpd.server_port + page1 = make_url(httpd, '/site1/') + page2 = make_url(httpd, '/site1/file1.txt') + robots = make_url(httpd, '/robots.txt') rr = doublethink.Rethinker('localhost', db='brozzler') service_registry = doublethink.ServiceRegistry(rr) site = brozzler.Site(rr, { - 'seed': 'http://localhost:%s/site1/' % httpd.server_port, + 'seed': make_url(httpd, '/site1/'), 'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}}) assert site.id is None frontier = brozzler.RethinkDbFrontier(rr) @@ -332,8 +359,8 @@ def _test_proxy_setting( pages = list(frontier.site_pages(site.id)) assert len(pages) == 2 assert {page.url for page in pages} == { - 'http://localhost:%s/site1/' % httpd.server_port, - 'http://localhost:%s/site1/file1.txt' % httpd.server_port} + make_url(httpd, '/site1/'), + make_url(httpd, '/site1/file1.txt')} time.sleep(2) # in case warcprox hasn't finished processing urls # take a look at the captures table @@ -360,7 +387,7 @@ def test_obey_robots(httpd): test_id = 'test_obey_robots-%s' % datetime.datetime.utcnow().isoformat() rr = doublethink.Rethinker('localhost', db='brozzler') site = brozzler.Site(rr, { - 'seed': 'http://localhost:%s/site1/' % httpd.server_port, + 'seed': make_url(httpd, '/site1/'), 'user_agent': 'im a badbot', # robots.txt blocks badbot 'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}}) @@ -390,12 +417,12 @@ def test_obey_robots(httpd): pages = list(frontier.site_pages(site.id)) assert len(pages) == 1 page = pages[0] - assert page.url == 'http://localhost:%s/site1/' % httpd.server_port + assert page.url == make_url(httpd, '/site1/') assert page.blocked_by_robots # take a look at the captures table time.sleep(2) # in case warcprox hasn't finished processing urls - robots_url = 'http://localhost:%s/robots.txt' % httpd.server_port + robots_url = make_url(httpd, '/robots.txt') captures = list(rr.table('captures').filter({'test_id':test_id}).run()) assert len(captures) == 1 assert captures[0]['url'] == robots_url @@ -412,7 +439,7 @@ def test_login(httpd): test_id = 'test_login-%s' % datetime.datetime.utcnow().isoformat() rr = doublethink.Rethinker('localhost', db='brozzler') site = brozzler.Site(rr, { - 'seed': 'http://localhost:%s/site2/' % httpd.server_port, + 'seed': make_url(httpd, '/site2/'), 'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}, 'username': 'test_username', 'password': 'test_password'}) @@ -428,7 +455,7 @@ def test_login(httpd): # take a look at the captures table time.sleep(2) # in case warcprox hasn't finished processing urls - robots_url = 'http://localhost:%s/robots.txt' % httpd.server_port + robots_url = make_url(httpd, '/robots.txt') captures = list(rr.table('captures').filter( {'test_id':test_id}).order_by('timestamp').run()) meth_url = ['%s %s' % (c['http_method'], c['url']) for c in captures] @@ -436,25 +463,25 @@ def test_login(httpd): # there are several forms in in htdocs/site2/login.html but only one # that brozzler's heuristic should match and try to submit, and it has # action='00', so we can check for that here - assert ('POST http://localhost:%s/site2/00' % httpd.server_port) in meth_url + assert ('POST %s' % make_url(httpd, '/site2/00')) in meth_url # sanity check the rest of the crawl - assert ('GET http://localhost:%s/robots.txt' % httpd.server_port) in meth_url - assert ('GET http://localhost:%s/site2/' % httpd.server_port) in meth_url - assert ('WARCPROX_WRITE_RECORD screenshot:http://localhost:%s/site2/' % httpd.server_port) in meth_url - assert ('WARCPROX_WRITE_RECORD thumbnail:http://localhost:%s/site2/' % httpd.server_port) in meth_url - assert ('GET http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url - assert ('WARCPROX_WRITE_RECORD screenshot:http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url - assert ('WARCPROX_WRITE_RECORD thumbnail:http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url + assert ('GET %s' % make_url(httpd, '/robots.txt')) in meth_url + assert ('GET %s' % make_url(httpd, '/site2/')) in meth_url + assert ('WARCPROX_WRITE_RECORD screenshot:%s' % make_url(httpd, '/site2/')) in meth_url + assert ('WARCPROX_WRITE_RECORD thumbnail:%s' % make_url(httpd, '/site2/')) in meth_url + assert ('GET %s' % make_url(httpd, '/site2/login.html')) in meth_url + assert ('WARCPROX_WRITE_RECORD screenshot:%s' % make_url(httpd, '/site2/login.html')) in meth_url + assert ('WARCPROX_WRITE_RECORD thumbnail:%s' % make_url(httpd, '/site2/login.html')) in meth_url def test_seed_redirect(httpd): test_id = 'test_seed_redirect-%s' % datetime.datetime.utcnow().isoformat() rr = doublethink.Rethinker('localhost', db='brozzler') - seed_url = 'http://localhost:%s/site5/redirect/' % httpd.server_port + seed_url = make_url(httpd, '/site5/redirect/') site = brozzler.Site(rr, { - 'seed': 'http://localhost:%s/site5/redirect/' % httpd.server_port, + 'seed': make_url(httpd, '/site5/redirect/'), 'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}}) - assert site.scope == {'accepts': [{'ssurt': 'localhost,//%s:http:/site5/redirect/' % httpd.server_port}]} + assert site.scope == {'accepts': [{'ssurt': '%s//%s:http:/site5/redirect/' % (local_address, httpd.server_port)}]} frontier = brozzler.RethinkDbFrontier(rr) brozzler.new_site(frontier, site) @@ -473,19 +500,19 @@ def test_seed_redirect(httpd): pages.sort(key=lambda page: page.hops_from_seed) assert pages[0].hops_from_seed == 0 assert pages[0].url == seed_url - assert pages[0].redirect_url == 'http://localhost:%s/site5/destination/' % httpd.server_port + assert pages[0].redirect_url == make_url(httpd, '/site5/destination/') assert pages[1].hops_from_seed == 1 - assert pages[1].url == 'http://localhost:%s/site5/destination/page2.html' % httpd.server_port + assert pages[1].url == make_url(httpd, '/site5/destination/page2.html') # check that scope has been updated properly assert site.scope == {'accepts': [ - {'ssurt': 'localhost,//%s:http:/site5/redirect/' % httpd.server_port}, - {'ssurt': 'localhost,//%s:http:/site5/destination/' % httpd.server_port}]} + {'ssurt': '%s//%s:http:/site5/redirect/' % (local_address, httpd.server_port)}, + {'ssurt': '%s//%s:http:/site5/destination/' % (local_address, httpd.server_port)}]} def test_hashtags(httpd): test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat() rr = doublethink.Rethinker('localhost', db='brozzler') - seed_url = 'http://localhost:%s/site7/' % httpd.server_port + seed_url = make_url(httpd, '/site7/') site = brozzler.Site(rr, { 'seed': seed_url, 'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}}) @@ -507,9 +534,9 @@ def test_hashtags(httpd): assert pages[0].url == seed_url assert pages[0].hops_from_seed == 0 assert pages[0].brozzle_count == 1 - assert pages[0].outlinks['accepted'] == ['http://localhost:%s/site7/foo.html' % httpd.server_port] + assert pages[0].outlinks['accepted'] == [make_url(httpd, '/site7/foo.html')] assert not pages[0].hashtags - assert pages[1].url == 'http://localhost:%s/site7/foo.html' % httpd.server_port + assert pages[1].url == make_url(httpd, '/site7/foo.html') assert pages[1].hops_from_seed == 1 assert pages[1].brozzle_count == 1 assert sorted(pages[1].hashtags) == ['#boosh','#ignored','#whee',] @@ -520,18 +547,18 @@ def test_hashtags(httpd): captures_by_url = { c['url']: c for c in captures if c['http_method'] != 'HEAD'} assert seed_url in captures_by_url - assert 'http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url - assert 'http://localhost:%s/site7/whee.txt' % httpd.server_port in captures_by_url - assert 'http://localhost:%s/site7/boosh.txt' % httpd.server_port in captures_by_url + assert make_url(httpd, '/site7/foo.html') in captures_by_url + assert make_url(httpd, '/site7/whee.txt') in captures_by_url + assert make_url(httpd, '/site7/boosh.txt') in captures_by_url assert 'screenshot:%s' % seed_url in captures_by_url assert 'thumbnail:%s' % seed_url in captures_by_url - assert 'screenshot:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url - assert 'thumbnail:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url + assert 'screenshot:%s' % make_url(httpd, '/site7/foo.html') in captures_by_url + assert 'thumbnail:%s' % make_url(httpd, '/site7/foo.html') in captures_by_url def test_redirect_hashtags(httpd): test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat() rr = doublethink.Rethinker('localhost', db='brozzler') - seed_url = 'http://localhost:%s/site9/' % httpd.server_port + seed_url = make_url(httpd, '/site9/') site = brozzler.Site(rr, { 'seed': seed_url, 'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}}) @@ -553,9 +580,9 @@ def test_redirect_hashtags(httpd): assert pages[0].url == seed_url assert pages[0].hops_from_seed == 0 assert pages[0].brozzle_count == 1 - assert pages[0].outlinks['accepted'] == ['http://localhost:%s/site9/redirect.html' % httpd.server_port] + assert pages[0].outlinks['accepted'] == [make_url(httpd, '/site9/redirect.html')] assert not pages[0].hashtags - assert pages[1].url == 'http://localhost:%s/site9/redirect.html' % httpd.server_port + assert pages[1].url == make_url(httpd, '/site9/redirect.html') assert pages[1].hops_from_seed == 1 assert pages[1].brozzle_count == 1 assert sorted(pages[1].hashtags) == ['#hash1','#hash2',] @@ -563,7 +590,7 @@ def test_redirect_hashtags(httpd): time.sleep(2) # in case warcprox hasn't finished processing urls # take a look at the captures table captures = rr.table('captures').filter({'test_id':test_id}).run() - redirect_captures = [c for c in captures if c['url'] == 'http://localhost:%s/site9/redirect.html' % httpd.server_port and c['http_method'] == 'GET'] + redirect_captures = [c for c in captures if c['url'] == make_url(httpd, '/site9/redirect.html') and c['http_method'] == 'GET'] assert len(redirect_captures) == 2 # youtube-dl + browser, no hashtags # === expected captures === @@ -589,9 +616,9 @@ def test_stop_crawl(httpd): # create a new job with three sites that could be crawled forever job_conf = {'seeds': [ - {'url': 'http://localhost:%s/infinite/foo/' % httpd.server_port}, - {'url': 'http://localhost:%s/infinite/bar/' % httpd.server_port}, - {'url': 'http://localhost:%s/infinite/baz/' % httpd.server_port}]} + {'url': make_url(httpd, '/infinite/foo/')}, + {'url': make_url(httpd, '/infinite/bar/')}, + {'url': make_url(httpd, '/infinite/baz/')}]} job = brozzler.new_job(frontier, job_conf) assert job.id @@ -675,7 +702,7 @@ def test_warcprox_outage_resiliency(httpd): # put together a site to crawl test_id = 'test_warcprox_death-%s' % datetime.datetime.utcnow().isoformat() site = brozzler.Site(rr, { - 'seed': 'http://localhost:%s/infinite/' % httpd.server_port, + 'seed': make_url(httpd, '/infinite/'), 'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}}) try: @@ -771,7 +798,7 @@ def test_time_limit(httpd): # create a new job with one seed that could be crawled forever job_conf = {'seeds': [{ - 'url': 'http://localhost:%s/infinite/foo/' % httpd.server_port, + 'url': make_url(httpd, '/infinite/foo/'), 'time_limit': 20}]} job = brozzler.new_job(frontier, job_conf) assert job.id @@ -801,7 +828,7 @@ def test_ydl_stitching(httpd): rr = doublethink.Rethinker('localhost', db='brozzler') frontier = brozzler.RethinkDbFrontier(rr) site = brozzler.Site(rr, { - 'seed': 'http://localhost:%s/site10/' % httpd.server_port, + 'seed': make_url(httpd, '/site10/'), 'warcprox_meta': { 'warc-prefix': 'test_ydl_stitching', 'captures-table-extra-fields': {'test_id':test_id}}}) @@ -819,7 +846,7 @@ def test_ydl_stitching(httpd): assert len(pages) == 1 page = pages[0] assert len(page.videos) == 6 - stitched_url = 'youtube-dl:00001:http://localhost:%s/site10/' % httpd.server_port + stitched_url = 'youtube-dl:00001:%s' % make_url(httpd, '/site10/') assert { 'blame': 'youtube-dl', 'content-length': 267900, From c651bcdd1896a35bde1e6fa1e858fee979de5667 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 16 May 2019 00:21:28 -0700 Subject: [PATCH 27/31] remove some travis-ci debugging stuff --- .travis.yml | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/.travis.yml b/.travis.yml index a2d8f2c..c20872e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,7 +11,6 @@ before_install: - sudo pip install ansible==2.1.3.0 install: - ansible-playbook --extra-vars="brozzler_pip_name=file://$TRAVIS_BUILD_DIR#egg=brozzler user=travis" --inventory-file=ansible/hosts-localhost ansible/playbook.yml -- head -999 /etc/service/*/run - pip install $TRAVIS_BUILD_DIR git+https://github.com/internetarchive/warcprox.git#egg=warcprox pytest==4.3.0 - chromium-browser --version - sudo apt-get update @@ -19,16 +18,10 @@ install: - chromium-browser --version - ps ww -fHe - sudo cat /var/log/Xvnc.log -- time DISPLAY=:1 timeout 20 chromium-browser || true -- time DISPLAY=:1 sudo -u brozzler timeout 20 chromium-browser || true -- mkdir -vp /tmp/chium -- time HOME=/tmp/chium DISPLAY=:1 timeout 20 chromium-browser --remote-debugging-port=9222 --use-mock-keychain --user-data-dir=/tmp/chium/chrome-user-data --disable-background-networking --disable-renderer-backgrounding --disable-hang-monitor --disable-background-timer-throttling --mute-audio --disable-web-sockets --window-size=1100,900 --no-default-browser-check --disable-first-run-ui --no-first-run --homepage=about:blank --disable-direct-npapi-requests --disable-web-security --disable-notifications --disable-extensions --disable-save-password-bubble --ignore-certificate-errors --proxy-server=localhost:8000 about:blank || true -- sudo svc -t /etc/service/brozzler-worker -- sleep 10 - sudo cat /var/log/brozzler-worker.log - sudo cat /var/log/warcprox.log script: -- DISPLAY=:1 py.test --tb=native -x -v tests +- DISPLAY=:1 py.test --tb=native -v tests after_failure: - chromium-browser --version - sudo cat /var/log/warcprox.log From f8db17ce3d3a0e76349be4550a3875ddf01e1b78 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 16 May 2019 00:22:29 -0700 Subject: [PATCH 28/31] bump version after merge --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 7139950..c474fa2 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.5.4', + version='1.5.5', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', From aa2d4910098613e7c595b88681ec11b7ab04c6ac Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 16 May 2019 01:29:05 -0700 Subject: [PATCH 29/31] i don't know where pyyaml 5.8 came from --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 43cef28..1181bea 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.5.5', + version='1.5.6', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', @@ -64,7 +64,7 @@ setuptools.setup( ], }, install_requires=[ - 'PyYAML>=5.8', + 'PyYAML>=5.1', 'youtube-dl>=2018.7.21', 'reppy==0.3.4', 'requests>=2.21', From 5fdb2dd39cfbc60e8d7c772cc8c6b6d5951c3c68 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 16 May 2019 14:03:43 -0700 Subject: [PATCH 30/31] documentation tweak --- job-conf.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/job-conf.rst b/job-conf.rst index fb32513..08707b6 100644 --- a/job-conf.rst +++ b/job-conf.rst @@ -339,12 +339,12 @@ Brozzler derives its general approach to the seed surt from `heritrix slash. 2. Canonicalization does not attempt to match heritrix exactly, though it usually does match. -3. When generating a SURT for an HTTPS URL, heritrix changes the scheme to - HTTP. For example, the heritrix SURT for ``https://www.example.com/`` is - ``http://(com,example,www,)`` and this means that all of - ``http://www.example.com/*`` and ``https://www.example.com/*`` are in - scope. It also means that a manually specified SURT with scheme "https" does - not match anything. Brozzler does no scheme munging. +3. Brozzler does no scheme munging. (When generating a SURT for an HTTPS URL, + heritrix changes the scheme to HTTP. For example, the heritrix SURT for + ``https://www.example.com/`` is ``http://(com,example,www,)`` and this means + that all of ``http://www.example.com/*`` and ``https://www.example.com/*`` + are in scope. It also means that a manually specified SURT with scheme + "https" does not match anything.) 4. Brozzler identifies seed "redirects" by retrieving the URL from the browser's location bar at the end of brozzling the seed page, whereas heritrix follows HTTP 3XX redirects. If the URL in the browser From 76b31a7b98e74a1ade0cadaaa63f06616aeb05d0 Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Fri, 17 May 2019 17:03:04 -0700 Subject: [PATCH 31/31] correct typos --- job-conf.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/job-conf.rst b/job-conf.rst index 08707b6..3b4e4b0 100644 --- a/job-conf.rst +++ b/job-conf.rst @@ -2,8 +2,8 @@ Brozzler Job Configuration ************************** Jobs are used to brozzle multiple seeds and/or apply settings and scope rules, -as defined byusing YAML files. At least one seed URL must be specified. -All other configurartions are optional. +and defined using YAML files. At least one seed URL must be specified. +All other configurations are optional. .. contents::