Merge branch 'master' into claim-batches

* master:
  back to dev version number
  commit for beta release
  this should fix travis build?
  fix tests
  update brozzler-easy for current warcprox api
  simpleclicks for minutes PDF
This commit is contained in:
Noah Levitt 2018-02-06 11:46:15 -08:00
commit 9a0941f1fd
6 changed files with 41 additions and 37 deletions

View file

@ -9,7 +9,7 @@ before_install:
- sudo pip install ansible==2.1.3.0 - sudo pip install ansible==2.1.3.0
install: install:
- ansible-playbook --extra-vars="brozzler_pip_name=file://$TRAVIS_BUILD_DIR#egg=brozzler user=travis" --inventory-file=ansible/hosts-localhost ansible/playbook.yml - ansible-playbook --extra-vars="brozzler_pip_name=file://$TRAVIS_BUILD_DIR#egg=brozzler user=travis" --inventory-file=ansible/hosts-localhost ansible/playbook.yml
- pip install $TRAVIS_BUILD_DIR 'warcprox==2.3' pytest - pip install $TRAVIS_BUILD_DIR 'warcprox>=2.4b1' pytest
script: script:
- DISPLAY=:1 py.test -v tests - DISPLAY=:1 py.test -v tests
after_failure: after_failure:

View file

@ -5,6 +5,7 @@
become: true become: true
- name: install pywb in virtualenv - name: install pywb in virtualenv
pip: name=pywb pip: name=pywb
version=0.33.2
virtualenv={{venv_root}}/pywb-ve34 virtualenv={{venv_root}}/pywb-ve34
virtualenv_python=python3.4 virtualenv_python=python3.4
extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache' extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'

View file

@ -68,6 +68,12 @@
click_css_selector: a[onclick] click_css_selector: a[onclick]
click_until_hard_timeout: False click_until_hard_timeout: False
request_idle_timeout_sec: 10 request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/ARI-5294
url_regex: '^https?://citymedfordwi\.civicweb\.net/.*$'
behavior_js_template: umbraBehavior.js.j2
default_parameters:
actions:
- selector: div.meeting-document-type-buttons button.button-small
- # https://webarchive.jira.com/browse/ARI-5409 - # https://webarchive.jira.com/browse/ARI-5409
url_regex: '^https?://(?:www\.)?tuebingen.de/.*$' url_regex: '^https?://(?:www\.)?tuebingen.de/.*$'
behavior_js_template: simpleclicks.js.j2 behavior_js_template: simpleclicks.js.j2

View file

@ -3,7 +3,7 @@
brozzler-easy - brozzler-worker, warcprox, pywb, and brozzler-dashboard all brozzler-easy - brozzler-worker, warcprox, pywb, and brozzler-dashboard all
working together in a single process working together in a single process
Copyright (C) 2016 Internet Archive Copyright (C) 2016-2018 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
@ -122,8 +122,8 @@ class BrozzlerEasyController:
def __init__(self, args): def __init__(self, args):
self.stop = threading.Event() self.stop = threading.Event()
self.args = args self.args = args
self.warcprox_controller = warcprox.main.init_controller( self.warcprox_controller = warcprox.controller.WarcproxController(
self._warcprox_args(args)) self._warcprox_opts(args))
self.brozzler_worker = self._init_brozzler_worker(args) self.brozzler_worker = self._init_brozzler_worker(args)
self.pywb_httpd = self._init_pywb(args) self.pywb_httpd = self._init_pywb(args)
self.dashboard_httpd = self._init_brozzler_dashboard(args) self.dashboard_httpd = self._init_brozzler_dashboard(args)
@ -221,40 +221,38 @@ class BrozzlerEasyController:
finally: finally:
self.shutdown() self.shutdown()
def _warcprox_args(self, args): def _warcprox_opts(self, args):
''' '''
Takes args as produced by the argument parser built by Takes args as produced by the argument parser built by
_build_arg_parser and builds warcprox arguments object suitable to pass _build_arg_parser and builds a warcprox.Options object suitable to pass
to warcprox.main.init_controller. Copies some arguments, renames some, to warcprox.controller.WarcproxController. Copies some arguments, renames some,
populates some with defaults appropriate for brozzler-easy, etc. populates some with defaults appropriate for brozzler-easy, etc.
''' '''
warcprox_args = argparse.Namespace() warcprox_opts = warcprox.Options()
warcprox_args.address = 'localhost' warcprox_opts.address = 'localhost'
# let the OS choose an available port; discover it later using # let the OS choose an available port; discover it later using
# sock.getsockname()[1] # sock.getsockname()[1]
warcprox_args.port = 0 warcprox_opts.port = 0
warcprox_args.cacert = args.cacert warcprox_opts.cacert = args.cacert
warcprox_args.certs_dir = args.certs_dir warcprox_opts.certs_dir = args.certs_dir
warcprox_args.directory = args.warcs_dir warcprox_opts.directory = args.warcs_dir
warcprox_args.gzip = True warcprox_opts.gzip = True
warcprox_args.prefix = 'brozzler' warcprox_opts.prefix = 'brozzler'
warcprox_args.size = 1000 * 1000* 1000 warcprox_opts.size = 1000 * 1000* 1000
warcprox_args.rollover_idle_time = 3 * 60 warcprox_opts.rollover_idle_time = 3 * 60
warcprox_args.digest_algorithm = 'sha1' warcprox_opts.digest_algorithm = 'sha1'
warcprox_args.base32 = True warcprox_opts.base32 = True
warcprox_args.stats_db_file = None warcprox_opts.stats_db_file = None
warcprox_args.playback_port = None warcprox_opts.playback_port = None
warcprox_args.playback_index_db_file = None warcprox_opts.playback_index_db_file = None
warcprox_args.rethinkdb_servers = args.rethinkdb_servers warcprox_opts.rethinkdb_big_table_url = (
warcprox_args.rethinkdb_db = args.rethinkdb_db 'rethinkdb://%s/%s/captures' % (
warcprox_args.rethinkdb_big_table = True args.rethinkdb_servers, args.rethinkdb_db))
warcprox_args.kafka_broker_list = None warcprox_opts.queue_size = 500
warcprox_args.kafka_capture_feed_topic = None warcprox_opts.max_threads = None
warcprox_args.queue_size = 500 warcprox_opts.profile = False
warcprox_args.max_threads = None warcprox_opts.onion_tor_socks_proxy = args.onion_tor_socks_proxy
warcprox_args.profile = False return warcprox_opts
warcprox_args.onion_tor_socks_proxy = args.onion_tor_socks_proxy
return warcprox_args
def dump_state(self, signum=None, frame=None): def dump_state(self, signum=None, frame=None):
state_strs = [] state_strs = []

View file

@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup( setuptools.setup(
name='brozzler', name='brozzler',
version='1.1b12.dev281', version='1.1b13.dev283',
description='Distributed web crawling with browsers', description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler', url='https://github.com/internetarchive/brozzler',
author='Noah Levitt', author='Noah Levitt',
@ -79,8 +79,8 @@ setuptools.setup(
extras_require={ extras_require={
'dashboard': ['flask>=0.11', 'gunicorn'], 'dashboard': ['flask>=0.11', 'gunicorn'],
'easy': [ 'easy': [
'warcprox>=2.1b1.dev87', 'warcprox>=2.4b1.dev145',
'pywb', 'pywb<2',
'flask>=0.11', 'flask>=0.11',
'gunicorn' 'gunicorn'
], ],

View file

@ -661,11 +661,10 @@ def test_warcprox_outage_resiliency(httpd):
opts = warcprox.Options() opts = warcprox.Options()
opts.address = '0.0.0.0' opts.address = '0.0.0.0'
opts.port = 0 opts.port = 0
opts.rethinkdb_services_url = 'rethinkdb://localhost/brozzler/services'
warcprox1 = warcprox.controller.WarcproxController( warcprox1 = warcprox.controller.WarcproxController(opts)
service_registry=svcreg, options=opts) warcprox2 = warcprox.controller.WarcproxController(opts)
warcprox2 = warcprox.controller.WarcproxController(
service_registry=svcreg, options=opts)
warcprox1_thread = threading.Thread( warcprox1_thread = threading.Thread(
target=warcprox1.run_until_shutdown, name='warcprox1') target=warcprox1.run_until_shutdown, name='warcprox1')
warcprox2_thread = threading.Thread( warcprox2_thread = threading.Thread(