From bc21b325d7adeb39dbda1196305a786033941b3c Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Tue, 19 Dec 2017 11:01:25 -0800 Subject: [PATCH 1/6] simpleclicks for minutes PDF --- brozzler/behaviors.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/brozzler/behaviors.yaml b/brozzler/behaviors.yaml index 7ccf698..4d88637 100644 --- a/brozzler/behaviors.yaml +++ b/brozzler/behaviors.yaml @@ -68,6 +68,12 @@ click_css_selector: a[onclick] click_until_hard_timeout: False request_idle_timeout_sec: 10 +- # https://webarchive.jira.com/browse/ARI-5294 + url_regex: '^https?://citymedfordwi\.civicweb\.net/.*$' + behavior_js_template: umbraBehavior.js.j2 + default_parameters: + actions: + - selector: div.meeting-document-type-buttons button.button-small - # https://webarchive.jira.com/browse/ARI-5409 url_regex: '^https?://(?:www\.)?tuebingen.de/.*$' behavior_js_template: simpleclicks.js.j2 From 5331aca33f350db3d1be383f76da30921dc1ac37 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 2 Feb 2018 14:28:46 -0800 Subject: [PATCH 2/6] update brozzler-easy for current warcprox api --- brozzler/easy.py | 56 +++++++++++++++++++++++------------------------- setup.py | 4 ++-- 2 files changed, 29 insertions(+), 31 deletions(-) diff --git a/brozzler/easy.py b/brozzler/easy.py index e41f013..c9480a1 100644 --- a/brozzler/easy.py +++ b/brozzler/easy.py @@ -3,7 +3,7 @@ brozzler-easy - brozzler-worker, warcprox, pywb, and brozzler-dashboard all working together in a single process -Copyright (C) 2016 Internet Archive +Copyright (C) 2016-2018 Internet Archive Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -122,8 +122,8 @@ class BrozzlerEasyController: def __init__(self, args): self.stop = threading.Event() self.args = args - self.warcprox_controller = warcprox.main.init_controller( - self._warcprox_args(args)) + self.warcprox_controller = warcprox.controller.WarcproxController( + self._warcprox_opts(args)) self.brozzler_worker = self._init_brozzler_worker(args) self.pywb_httpd = self._init_pywb(args) self.dashboard_httpd = self._init_brozzler_dashboard(args) @@ -221,40 +221,38 @@ class BrozzlerEasyController: finally: self.shutdown() - def _warcprox_args(self, args): + def _warcprox_opts(self, args): ''' Takes args as produced by the argument parser built by _build_arg_parser and builds warcprox arguments object suitable to pass to warcprox.main.init_controller. Copies some arguments, renames some, populates some with defaults appropriate for brozzler-easy, etc. ''' - warcprox_args = argparse.Namespace() - warcprox_args.address = 'localhost' + warcprox_opts = warcprox.Options() + warcprox_opts.address = 'localhost' # let the OS choose an available port; discover it later using # sock.getsockname()[1] - warcprox_args.port = 0 - warcprox_args.cacert = args.cacert - warcprox_args.certs_dir = args.certs_dir - warcprox_args.directory = args.warcs_dir - warcprox_args.gzip = True - warcprox_args.prefix = 'brozzler' - warcprox_args.size = 1000 * 1000* 1000 - warcprox_args.rollover_idle_time = 3 * 60 - warcprox_args.digest_algorithm = 'sha1' - warcprox_args.base32 = True - warcprox_args.stats_db_file = None - warcprox_args.playback_port = None - warcprox_args.playback_index_db_file = None - warcprox_args.rethinkdb_servers = args.rethinkdb_servers - warcprox_args.rethinkdb_db = args.rethinkdb_db - warcprox_args.rethinkdb_big_table = True - warcprox_args.kafka_broker_list = None - warcprox_args.kafka_capture_feed_topic = None - warcprox_args.queue_size = 500 - warcprox_args.max_threads = None - warcprox_args.profile = False - warcprox_args.onion_tor_socks_proxy = args.onion_tor_socks_proxy - return warcprox_args + warcprox_opts.port = 0 + warcprox_opts.cacert = args.cacert + warcprox_opts.certs_dir = args.certs_dir + warcprox_opts.directory = args.warcs_dir + warcprox_opts.gzip = True + warcprox_opts.prefix = 'brozzler' + warcprox_opts.size = 1000 * 1000* 1000 + warcprox_opts.rollover_idle_time = 3 * 60 + warcprox_opts.digest_algorithm = 'sha1' + warcprox_opts.base32 = True + warcprox_opts.stats_db_file = None + warcprox_opts.playback_port = None + warcprox_opts.playback_index_db_file = None + warcprox_opts.rethinkdb_big_table_url = ( + 'rethinkdb://%s/%s/captures' % ( + args.rethinkdb_servers, args.rethinkdb_db)) + warcprox_opts.queue_size = 500 + warcprox_opts.max_threads = None + warcprox_opts.profile = False + warcprox_opts.onion_tor_socks_proxy = args.onion_tor_socks_proxy + return warcprox_opts def dump_state(self, signum=None, frame=None): state_strs = [] diff --git a/setup.py b/setup.py index 6c3a66d..7a81fc1 100644 --- a/setup.py +++ b/setup.py @@ -79,8 +79,8 @@ setuptools.setup( extras_require={ 'dashboard': ['flask>=0.11', 'gunicorn'], 'easy': [ - 'warcprox>=2.1b1.dev87', - 'pywb', + 'warcprox>=2.4b1.dev145', + 'pywb<2', 'flask>=0.11', 'gunicorn' ], From 8505720c4198d1684813816a8c48465767ad34f1 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 2 Feb 2018 15:11:26 -0800 Subject: [PATCH 3/6] fix tests --- ansible/roles/pywb/tasks/main.yml | 1 + setup.py | 2 +- tests/test_cluster.py | 7 +++---- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ansible/roles/pywb/tasks/main.yml b/ansible/roles/pywb/tasks/main.yml index 975359f..16b9ea7 100644 --- a/ansible/roles/pywb/tasks/main.yml +++ b/ansible/roles/pywb/tasks/main.yml @@ -5,6 +5,7 @@ become: true - name: install pywb in virtualenv pip: name=pywb + version=0.33.2 virtualenv={{venv_root}}/pywb-ve34 virtualenv_python=python3.4 extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache' diff --git a/setup.py b/setup.py index 7a81fc1..8c263fc 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b12.dev281', + version='1.1b12.dev282', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', diff --git a/tests/test_cluster.py b/tests/test_cluster.py index 48a9384..32e9734 100644 --- a/tests/test_cluster.py +++ b/tests/test_cluster.py @@ -661,11 +661,10 @@ def test_warcprox_outage_resiliency(httpd): opts = warcprox.Options() opts.address = '0.0.0.0' opts.port = 0 + opts.rethinkdb_services_url = 'rethinkdb://localhost/brozzler/services' - warcprox1 = warcprox.controller.WarcproxController( - service_registry=svcreg, options=opts) - warcprox2 = warcprox.controller.WarcproxController( - service_registry=svcreg, options=opts) + warcprox1 = warcprox.controller.WarcproxController(opts) + warcprox2 = warcprox.controller.WarcproxController(opts) warcprox1_thread = threading.Thread( target=warcprox1.run_until_shutdown, name='warcprox1') warcprox2_thread = threading.Thread( From 9ba58de2926949958313aaa0da43115d2dcedca3 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 2 Feb 2018 16:25:56 -0800 Subject: [PATCH 4/6] this should fix travis build? --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index cef3d80..80b82d4 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,7 +9,7 @@ before_install: - sudo pip install ansible==2.1.3.0 install: - ansible-playbook --extra-vars="brozzler_pip_name=file://$TRAVIS_BUILD_DIR#egg=brozzler user=travis" --inventory-file=ansible/hosts-localhost ansible/playbook.yml -- pip install $TRAVIS_BUILD_DIR 'warcprox==2.3' pytest +- pip install $TRAVIS_BUILD_DIR 'warcprox>=2.4b1' pytest script: - DISPLAY=:1 py.test -v tests after_failure: From 2a0ad6d0de01f287c88f0aa5be2df2e42ccf1bfb Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 2 Feb 2018 16:52:42 -0800 Subject: [PATCH 5/6] commit for beta release --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8c263fc..1cbad57 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b12.dev282', + version='1.1b12', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', From 95cbfa96e2b31975cbc37125c1f525d21db9495e Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 2 Feb 2018 16:54:29 -0800 Subject: [PATCH 6/6] back to dev version number --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 1cbad57..5957cd2 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b12', + version='1.1b13.dev283', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt',