mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-12-13 07:38:55 -05:00
Merge branch 'master' into claim-batches
* master:
  back to dev version number
  commit for beta release
  this should fix travis build?
  fix tests
  update brozzler-easy for current warcprox api
  simpleclicks for minutes PDF
This commit is contained in:
commit
9a0941f1fd
6 changed files with 41 additions and 37 deletions
|
|
@ -9,7 +9,7 @@ before_install:
|
||||||
- sudo pip install ansible==2.1.3.0
|
- sudo pip install ansible==2.1.3.0
|
||||||
install:
|
install:
|
||||||
- ansible-playbook --extra-vars="brozzler_pip_name=file://$TRAVIS_BUILD_DIR#egg=brozzler user=travis" --inventory-file=ansible/hosts-localhost ansible/playbook.yml
|
- ansible-playbook --extra-vars="brozzler_pip_name=file://$TRAVIS_BUILD_DIR#egg=brozzler user=travis" --inventory-file=ansible/hosts-localhost ansible/playbook.yml
|
||||||
- pip install $TRAVIS_BUILD_DIR 'warcprox==2.3' pytest
|
- pip install $TRAVIS_BUILD_DIR 'warcprox>=2.4b1' pytest
|
||||||
script:
|
script:
|
||||||
- DISPLAY=:1 py.test -v tests
|
- DISPLAY=:1 py.test -v tests
|
||||||
after_failure:
|
after_failure:
|
||||||
|
|
|
||||||
|
|
@ -5,6 +5,7 @@
|
||||||
become: true
|
become: true
|
||||||
- name: install pywb in virtualenv
|
- name: install pywb in virtualenv
|
||||||
pip: name=pywb
|
pip: name=pywb
|
||||||
|
version=0.33.2
|
||||||
virtualenv={{venv_root}}/pywb-ve34
|
virtualenv={{venv_root}}/pywb-ve34
|
||||||
virtualenv_python=python3.4
|
virtualenv_python=python3.4
|
||||||
extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
|
extra_args='--no-input --upgrade --pre --cache-dir=/tmp/pip-cache'
|
||||||
|
|
|
||||||
|
|
@ -68,6 +68,12 @@
|
||||||
click_css_selector: a[onclick]
|
click_css_selector: a[onclick]
|
||||||
click_until_hard_timeout: False
|
click_until_hard_timeout: False
|
||||||
request_idle_timeout_sec: 10
|
request_idle_timeout_sec: 10
|
||||||
|
- # https://webarchive.jira.com/browse/ARI-5294
|
||||||
|
url_regex: '^https?://citymedfordwi\.civicweb\.net/.*$'
|
||||||
|
behavior_js_template: umbraBehavior.js.j2
|
||||||
|
default_parameters:
|
||||||
|
actions:
|
||||||
|
- selector: div.meeting-document-type-buttons button.button-small
|
||||||
- # https://webarchive.jira.com/browse/ARI-5409
|
- # https://webarchive.jira.com/browse/ARI-5409
|
||||||
url_regex: '^https?://(?:www\.)?tuebingen.de/.*$'
|
url_regex: '^https?://(?:www\.)?tuebingen.de/.*$'
|
||||||
behavior_js_template: simpleclicks.js.j2
|
behavior_js_template: simpleclicks.js.j2
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,7 @@
|
||||||
brozzler-easy - brozzler-worker, warcprox, pywb, and brozzler-dashboard all
|
brozzler-easy - brozzler-worker, warcprox, pywb, and brozzler-dashboard all
|
||||||
working together in a single process
|
working together in a single process
|
||||||
|
|
||||||
Copyright (C) 2016 Internet Archive
|
Copyright (C) 2016-2018 Internet Archive
|
||||||
|
|
||||||
Licensed under the Apache License, Version 2.0 (the "License");
|
Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
you may not use this file except in compliance with the License.
|
you may not use this file except in compliance with the License.
|
||||||
|
|
@ -122,8 +122,8 @@ class BrozzlerEasyController:
|
||||||
def __init__(self, args):
|
def __init__(self, args):
|
||||||
self.stop = threading.Event()
|
self.stop = threading.Event()
|
||||||
self.args = args
|
self.args = args
|
||||||
self.warcprox_controller = warcprox.main.init_controller(
|
self.warcprox_controller = warcprox.controller.WarcproxController(
|
||||||
self._warcprox_args(args))
|
self._warcprox_opts(args))
|
||||||
self.brozzler_worker = self._init_brozzler_worker(args)
|
self.brozzler_worker = self._init_brozzler_worker(args)
|
||||||
self.pywb_httpd = self._init_pywb(args)
|
self.pywb_httpd = self._init_pywb(args)
|
||||||
self.dashboard_httpd = self._init_brozzler_dashboard(args)
|
self.dashboard_httpd = self._init_brozzler_dashboard(args)
|
||||||
|
|
@ -221,40 +221,38 @@ class BrozzlerEasyController:
|
||||||
finally:
|
finally:
|
||||||
self.shutdown()
|
self.shutdown()
|
||||||
|
|
||||||
def _warcprox_args(self, args):
|
def _warcprox_opts(self, args):
|
||||||
'''
|
'''
|
||||||
Takes args as produced by the argument parser built by
|
Takes args as produced by the argument parser built by
|
||||||
_build_arg_parser and builds warcprox arguments object suitable to pass
|
_build_arg_parser and builds a warcprox.Options object suitable to pass
|
||||||
to warcprox.main.init_controller. Copies some arguments, renames some,
|
to warcprox.controller.WarcproxController. Copies some arguments, renames some,
|
||||||
populates some with defaults appropriate for brozzler-easy, etc.
|
populates some with defaults appropriate for brozzler-easy, etc.
|
||||||
'''
|
'''
|
||||||
warcprox_args = argparse.Namespace()
|
warcprox_opts = warcprox.Options()
|
||||||
warcprox_args.address = 'localhost'
|
warcprox_opts.address = 'localhost'
|
||||||
# let the OS choose an available port; discover it later using
|
# let the OS choose an available port; discover it later using
|
||||||
# sock.getsockname()[1]
|
# sock.getsockname()[1]
|
||||||
warcprox_args.port = 0
|
warcprox_opts.port = 0
|
||||||
warcprox_args.cacert = args.cacert
|
warcprox_opts.cacert = args.cacert
|
||||||
warcprox_args.certs_dir = args.certs_dir
|
warcprox_opts.certs_dir = args.certs_dir
|
||||||
warcprox_args.directory = args.warcs_dir
|
warcprox_opts.directory = args.warcs_dir
|
||||||
warcprox_args.gzip = True
|
warcprox_opts.gzip = True
|
||||||
warcprox_args.prefix = 'brozzler'
|
warcprox_opts.prefix = 'brozzler'
|
||||||
warcprox_args.size = 1000 * 1000* 1000
|
warcprox_opts.size = 1000 * 1000* 1000
|
||||||
warcprox_args.rollover_idle_time = 3 * 60
|
warcprox_opts.rollover_idle_time = 3 * 60
|
||||||
warcprox_args.digest_algorithm = 'sha1'
|
warcprox_opts.digest_algorithm = 'sha1'
|
||||||
warcprox_args.base32 = True
|
warcprox_opts.base32 = True
|
||||||
warcprox_args.stats_db_file = None
|
warcprox_opts.stats_db_file = None
|
||||||
warcprox_args.playback_port = None
|
warcprox_opts.playback_port = None
|
||||||
warcprox_args.playback_index_db_file = None
|
warcprox_opts.playback_index_db_file = None
|
||||||
warcprox_args.rethinkdb_servers = args.rethinkdb_servers
|
warcprox_opts.rethinkdb_big_table_url = (
|
||||||
warcprox_args.rethinkdb_db = args.rethinkdb_db
|
'rethinkdb://%s/%s/captures' % (
|
||||||
warcprox_args.rethinkdb_big_table = True
|
args.rethinkdb_servers, args.rethinkdb_db))
|
||||||
warcprox_args.kafka_broker_list = None
|
warcprox_opts.queue_size = 500
|
||||||
warcprox_args.kafka_capture_feed_topic = None
|
warcprox_opts.max_threads = None
|
||||||
warcprox_args.queue_size = 500
|
warcprox_opts.profile = False
|
||||||
warcprox_args.max_threads = None
|
warcprox_opts.onion_tor_socks_proxy = args.onion_tor_socks_proxy
|
||||||
warcprox_args.profile = False
|
return warcprox_opts
|
||||||
warcprox_args.onion_tor_socks_proxy = args.onion_tor_socks_proxy
|
|
||||||
return warcprox_args
|
|
||||||
|
|
||||||
def dump_state(self, signum=None, frame=None):
|
def dump_state(self, signum=None, frame=None):
|
||||||
state_strs = []
|
state_strs = []
|
||||||
|
|
|
||||||
6
setup.py
6
setup.py
|
|
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.1b12.dev281',
|
version='1.1b13.dev283',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
|
@ -79,8 +79,8 @@ setuptools.setup(
|
||||||
extras_require={
|
extras_require={
|
||||||
'dashboard': ['flask>=0.11', 'gunicorn'],
|
'dashboard': ['flask>=0.11', 'gunicorn'],
|
||||||
'easy': [
|
'easy': [
|
||||||
'warcprox>=2.1b1.dev87',
|
'warcprox>=2.4b1.dev145',
|
||||||
'pywb',
|
'pywb<2',
|
||||||
'flask>=0.11',
|
'flask>=0.11',
|
||||||
'gunicorn'
|
'gunicorn'
|
||||||
],
|
],
|
||||||
|
|
|
||||||
|
|
@ -661,11 +661,10 @@ def test_warcprox_outage_resiliency(httpd):
|
||||||
opts = warcprox.Options()
|
opts = warcprox.Options()
|
||||||
opts.address = '0.0.0.0'
|
opts.address = '0.0.0.0'
|
||||||
opts.port = 0
|
opts.port = 0
|
||||||
|
opts.rethinkdb_services_url = 'rethinkdb://localhost/brozzler/services'
|
||||||
|
|
||||||
warcprox1 = warcprox.controller.WarcproxController(
|
warcprox1 = warcprox.controller.WarcproxController(opts)
|
||||||
service_registry=svcreg, options=opts)
|
warcprox2 = warcprox.controller.WarcproxController(opts)
|
||||||
warcprox2 = warcprox.controller.WarcproxController(
|
|
||||||
service_registry=svcreg, options=opts)
|
|
||||||
warcprox1_thread = threading.Thread(
|
warcprox1_thread = threading.Thread(
|
||||||
target=warcprox1.run_until_shutdown, name='warcprox1')
|
target=warcprox1.run_until_shutdown, name='warcprox1')
|
||||||
warcprox2_thread = threading.Thread(
|
warcprox2_thread = threading.Thread(
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue