mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-08-09 06:52:46 -04:00
Merge branch 'master' into qa
* master: oops, back to dev version number wait 20 seconds to claim sites if none were avail- tweak logging why did those tests fail??? (#117) Add screenshots Add screenshots back to dev version 1.4 for pypi explain --warcprox-auto briefly vagrant readme fixes (thanks funkyfuture) update cryptography dep version
This commit is contained in:
commit
c4fdbe578d
12 changed files with 79 additions and 54 deletions
|
@ -16,7 +16,7 @@ install:
|
||||||
- chromium-browser --version
|
- chromium-browser --version
|
||||||
- sudo service brozzler-worker restart
|
- sudo service brozzler-worker restart
|
||||||
script:
|
script:
|
||||||
- DISPLAY=:1 py.test -v tests
|
- DISPLAY=:1 py.test --tb=native -v tests
|
||||||
after_failure:
|
after_failure:
|
||||||
- chromium-browser --version
|
- chromium-browser --version
|
||||||
- sudo cat /var/log/upstart/warcprox.log
|
- sudo cat /var/log/upstart/warcprox.log
|
||||||
|
|
BIN
Brozzler-Dashboard.png
Normal file
BIN
Brozzler-Dashboard.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 136 KiB |
BIN
Brozzler-Wayback.png
Normal file
BIN
Brozzler-Wayback.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 1.1 MiB |
16
README.rst
16
README.rst
|
@ -1,4 +1,4 @@
|
||||||
.. image:: https://travis-ci.org/internetarchive/brozzler.svg?branch=master
|
.. image:: https://travis-ci.org/internetarchive/brozzler.svg?branch=1.4
|
||||||
:target: https://travis-ci.org/internetarchive/brozzler
|
:target: https://travis-ci.org/internetarchive/brozzler
|
||||||
|
|
||||||
.. |logo| image:: https://cdn.rawgit.com/internetarchive/brozzler/1.1b12/brozzler/dashboard/static/brozzler.svg
|
.. |logo| image:: https://cdn.rawgit.com/internetarchive/brozzler/1.1b12/brozzler/dashboard/static/brozzler.svg
|
||||||
|
@ -73,7 +73,7 @@ To install brozzler only::
|
||||||
|
|
||||||
pip install brozzler # in a virtualenv if desired
|
pip install brozzler # in a virtualenv if desired
|
||||||
|
|
||||||
Launch one or more workers::
|
Launch one or more workers: [*]_ ::
|
||||||
|
|
||||||
brozzler-worker --warcprox-auto
|
brozzler-worker --warcprox-auto
|
||||||
|
|
||||||
|
@ -85,6 +85,14 @@ Submit sites not tied to a job::
|
||||||
|
|
||||||
brozzler-new-site --time-limit=600 http://example.com/
|
brozzler-new-site --time-limit=600 http://example.com/
|
||||||
|
|
||||||
|
.. [*] A note about ``--warcprox-auto``: this option tells brozzler to
|
||||||
|
look for a healthy warcprox instance in the `rethinkdb service registry
|
||||||
|
<https://github.com/internetarchive/doublethink#service-registry>`_. For
|
||||||
|
this to work you need to have at least one instance of warcprox running,
|
||||||
|
with the ``--rethinkdb-services-url`` option pointing to the same rethinkdb
|
||||||
|
services table that brozzler is using. Using ``--warcprox-auto`` is
|
||||||
|
recommended for clustered deployments.
|
||||||
|
|
||||||
Job Configuration
|
Job Configuration
|
||||||
-----------------
|
-----------------
|
||||||
|
|
||||||
|
@ -129,6 +137,8 @@ To start the app, run
|
||||||
|
|
||||||
At this point Brozzler Dashboard will be accessible at http://localhost:8000/.
|
At this point Brozzler Dashboard will be accessible at http://localhost:8000/.
|
||||||
|
|
||||||
|
.. image:: Brozzler-Dashboard.png
|
||||||
|
|
||||||
See ``brozzler-dashboard --help`` for configuration options.
|
See ``brozzler-dashboard --help`` for configuration options.
|
||||||
|
|
||||||
Brozzler Wayback
|
Brozzler Wayback
|
||||||
|
@ -170,6 +180,8 @@ Run pywb like so:
|
||||||
|
|
||||||
Then browse http://localhost:8880/brozzler/.
|
Then browse http://localhost:8880/brozzler/.
|
||||||
|
|
||||||
|
.. image:: Brozzler-Wayback.png
|
||||||
|
|
||||||
Headless Chrome (experimental)
|
Headless Chrome (experimental)
|
||||||
------------------------------
|
------------------------------
|
||||||
|
|
||||||
|
|
|
@ -17,6 +17,7 @@ See the License for the specific language governing permissions and
|
||||||
limitations under the License.
|
limitations under the License.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
from pkg_resources import get_distribution as _get_distribution
|
from pkg_resources import get_distribution as _get_distribution
|
||||||
__version__ = _get_distribution('brozzler').version
|
__version__ = _get_distribution('brozzler').version
|
||||||
|
|
||||||
|
@ -57,18 +58,22 @@ class ReachedLimit(Exception):
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return self.__repr__()
|
return self.__repr__()
|
||||||
|
|
||||||
# monkey-patch log level TRACE
|
# monkey-patch log levels TRACE and NOTICE
|
||||||
TRACE = 5
|
logging.TRACE = (logging.NOTSET + logging.DEBUG) // 2
|
||||||
import logging
|
|
||||||
def _logging_trace(msg, *args, **kwargs):
|
|
||||||
logging.root.trace(msg, *args, **kwargs)
|
|
||||||
def _logger_trace(self, msg, *args, **kwargs):
|
def _logger_trace(self, msg, *args, **kwargs):
|
||||||
if self.isEnabledFor(TRACE):
|
if self.isEnabledFor(logging.TRACE):
|
||||||
self._log(TRACE, msg, args, **kwargs)
|
self._log(logging.TRACE, msg, args, **kwargs)
|
||||||
logging.trace = _logging_trace
|
|
||||||
logging.Logger.trace = _logger_trace
|
logging.Logger.trace = _logger_trace
|
||||||
logging._levelToName[TRACE] = 'TRACE'
|
logging.trace = logging.root.trace
|
||||||
logging._nameToLevel['TRACE'] = TRACE
|
logging.addLevelName(logging.TRACE, 'TRACE')
|
||||||
|
|
||||||
|
logging.NOTICE = (logging.INFO + logging.WARN) // 2
|
||||||
|
def _logger_notice(self, msg, *args, **kwargs):
|
||||||
|
if self.isEnabledFor(logging.NOTICE):
|
||||||
|
self._log(logging.NOTICE, msg, args, **kwargs)
|
||||||
|
logging.Logger.notice = _logger_notice
|
||||||
|
logging.notice = logging.root.notice
|
||||||
|
logging.addLevelName(logging.NOTICE, 'NOTICE')
|
||||||
|
|
||||||
# see https://github.com/internetarchive/brozzler/issues/91
|
# see https://github.com/internetarchive/brozzler/issues/91
|
||||||
def _logging_handler_handle(self, record):
|
def _logging_handler_handle(self, record):
|
||||||
|
|
|
@ -317,7 +317,7 @@ class Browser:
|
||||||
kwargs['id'] = msg_id
|
kwargs['id'] = msg_id
|
||||||
msg = json.dumps(kwargs, separators=',:')
|
msg = json.dumps(kwargs, separators=',:')
|
||||||
logging.log(
|
logging.log(
|
||||||
brozzler.TRACE if suppress_logging else logging.DEBUG,
|
logging.TRACE if suppress_logging else logging.DEBUG,
|
||||||
'sending message to %s: %s', self.websock, msg)
|
'sending message to %s: %s', self.websock, msg)
|
||||||
self.websock.send(msg)
|
self.websock.send(msg)
|
||||||
return msg_id
|
return msg_id
|
||||||
|
|
|
@ -43,15 +43,14 @@ def add_common_options(arg_parser, argv=None):
|
||||||
argv = argv or sys.argv
|
argv = argv or sys.argv
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'-q', '--quiet', dest='log_level', action='store_const',
|
'-q', '--quiet', dest='log_level', action='store_const',
|
||||||
default=logging.INFO, const=logging.WARN, help=(
|
default=logging.INFO, const=logging.NOTICE, help='quiet logging')
|
||||||
'quiet logging, only warnings and errors'))
|
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'-v', '--verbose', dest='log_level', action='store_const',
|
'-v', '--verbose', dest='log_level', action='store_const',
|
||||||
default=logging.INFO, const=logging.DEBUG, help=(
|
default=logging.INFO, const=logging.DEBUG, help=(
|
||||||
'verbose logging'))
|
'verbose logging'))
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'--trace', dest='log_level', action='store_const',
|
'--trace', dest='log_level', action='store_const',
|
||||||
default=logging.INFO, const=brozzler.TRACE, help=(
|
default=logging.INFO, const=logging.TRACE, help=(
|
||||||
'very verbose logging'))
|
'very verbose logging'))
|
||||||
# arg_parser.add_argument(
|
# arg_parser.add_argument(
|
||||||
# '-s', '--silent', dest='log_level', action='store_const',
|
# '-s', '--silent', dest='log_level', action='store_const',
|
||||||
|
|
|
@ -94,6 +94,7 @@ class RethinkDbFrontier:
|
||||||
k, expected, result))
|
k, expected, result))
|
||||||
|
|
||||||
def claim_sites(self, n=1):
|
def claim_sites(self, n=1):
|
||||||
|
self.logger.trace('claiming up to %s sites to brozzle', n)
|
||||||
result = (
|
result = (
|
||||||
self.rr.table('sites').get_all(r.args(
|
self.rr.table('sites').get_all(r.args(
|
||||||
r.db(self.rr.dbname).table('sites', read_mode='majority')
|
r.db(self.rr.dbname).table('sites', read_mode='majority')
|
||||||
|
@ -145,6 +146,7 @@ class RethinkDbFrontier:
|
||||||
result["changes"][i]["old_val"]["last_claimed"])
|
result["changes"][i]["old_val"]["last_claimed"])
|
||||||
site = brozzler.Site(self.rr, result["changes"][i]["new_val"])
|
site = brozzler.Site(self.rr, result["changes"][i]["new_val"])
|
||||||
sites.append(site)
|
sites.append(site)
|
||||||
|
self.logger.debug('claimed %s sites', len(sites))
|
||||||
if sites:
|
if sites:
|
||||||
return sites
|
return sites
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -446,6 +446,7 @@ class BrozzlerWorker:
|
||||||
Raises:
|
Raises:
|
||||||
NoBrowsersAvailable if none available
|
NoBrowsersAvailable if none available
|
||||||
'''
|
'''
|
||||||
|
# acquire_multi() raises NoBrowsersAvailable if none available
|
||||||
browsers = self._browser_pool.acquire_multi(
|
browsers = self._browser_pool.acquire_multi(
|
||||||
(self._browser_pool.num_available() + 1) // 2)
|
(self._browser_pool.num_available() + 1) // 2)
|
||||||
try:
|
try:
|
||||||
|
@ -468,22 +469,26 @@ class BrozzlerWorker:
|
||||||
self._browser_pool.release(browsers[i])
|
self._browser_pool.release(browsers[i])
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
self.logger.info("brozzler worker starting")
|
self.logger.notice("brozzler worker starting")
|
||||||
|
last_nothing_to_claim = 0
|
||||||
try:
|
try:
|
||||||
while not self._shutdown.is_set():
|
while not self._shutdown.is_set():
|
||||||
self._service_heartbeat_if_due()
|
self._service_heartbeat_if_due()
|
||||||
|
if time.time() - last_nothing_to_claim > 20:
|
||||||
try:
|
try:
|
||||||
self._start_browsing_some_sites()
|
self._start_browsing_some_sites()
|
||||||
except brozzler.browser.NoBrowsersAvailable:
|
except brozzler.browser.NoBrowsersAvailable:
|
||||||
logging.trace(
|
logging.trace(
|
||||||
"all %s browsers are in use", self._max_browsers)
|
"all %s browsers are in use",
|
||||||
|
self._max_browsers)
|
||||||
except brozzler.NothingToClaim:
|
except brozzler.NothingToClaim:
|
||||||
|
last_nothing_to_claim = time.time()
|
||||||
logging.trace(
|
logging.trace(
|
||||||
"all active sites are already claimed by a "
|
"nothing to claim, all available active sites "
|
||||||
"brozzler worker")
|
"are already claimed by a brozzler worker")
|
||||||
time.sleep(0.5)
|
time.sleep(0.5)
|
||||||
|
|
||||||
self.logger.info("shutdown requested")
|
self.logger.notice("shutdown requested")
|
||||||
except r.ReqlError as e:
|
except r.ReqlError as e:
|
||||||
self.logger.error(
|
self.logger.error(
|
||||||
"caught rethinkdb exception, will try to proceed",
|
"caught rethinkdb exception, will try to proceed",
|
||||||
|
|
31
setup.py
31
setup.py
|
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.4.dev297',
|
version='1.5.dev302',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
@ -63,32 +63,35 @@ setuptools.setup(
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
install_requires=[
|
install_requires=[
|
||||||
'PyYAML',
|
'PyYAML>=3.12',
|
||||||
'youtube-dl',
|
'youtube-dl>=2018.7.21',
|
||||||
'reppy==0.3.4',
|
'reppy==0.3.4',
|
||||||
'requests',
|
'python-magic',
|
||||||
'websocket-client!=0.39.0,!=0.49.0',
|
'requests>=2.18.4',
|
||||||
|
'websocket-client>=0.39.0,<=0.48.0',
|
||||||
'pillow>=5.2.0',
|
'pillow>=5.2.0',
|
||||||
'urlcanon>=0.1.dev23',
|
'urlcanon>=0.1.dev23',
|
||||||
'doublethink>=0.2.0.dev88',
|
'doublethink>=0.2.0.dev88',
|
||||||
'rethinkdb>=2.3,<2.4',
|
'rethinkdb>=2.3',
|
||||||
'cerberus==1.0.1',
|
'cerberus>=1.0.1',
|
||||||
'jinja2',
|
'jinja2>=2.10',
|
||||||
'cryptography!=2.1.1', # 2.1.1 installation is failing on ubuntu
|
'cryptography>=2.3',
|
||||||
'python-magic',
|
|
||||||
],
|
],
|
||||||
extras_require={
|
extras_require={
|
||||||
'dashboard': ['flask>=0.11', 'gunicorn'],
|
'dashboard': [
|
||||||
|
'flask>=0.11',
|
||||||
|
'gunicorn>=19.8.1'
|
||||||
|
],
|
||||||
'easy': [
|
'easy': [
|
||||||
'warcprox>=2.4b2.dev173',
|
'warcprox>=2.4b2.dev173',
|
||||||
'pywb<2',
|
'pywb>=0.33.2,<2',
|
||||||
'flask>=0.11',
|
'flask>=0.11',
|
||||||
'gunicorn'
|
'gunicorn>=19.8.1'
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
zip_safe=False,
|
zip_safe=False,
|
||||||
classifiers=[
|
classifiers=[
|
||||||
'Development Status :: 4 - Beta',
|
'Development Status :: 5 - Production/Stable',
|
||||||
'Environment :: Console',
|
'Environment :: Console',
|
||||||
'License :: OSI Approved :: Apache Software License',
|
'License :: OSI Approved :: Apache Software License',
|
||||||
'Programming Language :: Python :: 3.4',
|
'Programming Language :: Python :: 3.4',
|
||||||
|
|
|
@ -53,9 +53,9 @@ def test_run_command(capsys, cmd):
|
||||||
proc = subprocess.Popen(
|
proc = subprocess.Popen(
|
||||||
[cmd, '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
[cmd, '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||||
out, err = proc.communicate()
|
out, err = proc.communicate()
|
||||||
|
assert err == b''
|
||||||
assert out == ('brozzler %s - %s\n' % (
|
assert out == ('brozzler %s - %s\n' % (
|
||||||
brozzler.__version__, cmd)).encode('ascii')
|
brozzler.__version__, cmd)).encode('ascii')
|
||||||
assert err == b''
|
|
||||||
|
|
||||||
def test_rethinkdb_up():
|
def test_rethinkdb_up():
|
||||||
'''Check that rethinkdb is up and running.'''
|
'''Check that rethinkdb is up and running.'''
|
||||||
|
|
|
@ -1,15 +1,14 @@
|
||||||
Single-VM Vagrant Brozzler Deployment
|
Single-VM Vagrant Brozzler Deployment
|
||||||
-------------------------------------
|
-------------------------------------
|
||||||
|
|
||||||
This is a work in progress. Vagrant + ansible configuration for a single-vm
|
This is a vagrant + ansible configuration for a single-vm deployment of
|
||||||
deployment of brozzler and warcprox with dependencies (notably rethinkdb).
|
brozzler and warcprox with dependencies (notably rethinkdb).
|
||||||
|
|
||||||
The idea is for this to be a quick way for people to get up and running with a
|
The idea is for this to be a quick way for people to get up and running with a
|
||||||
deployment resembling a real distributed deployment, and to offer a starting
|
deployment resembling a real distributed deployment, and to offer a starting
|
||||||
configuration for people to adapt to their clusters.
|
configuration for people to adapt to their clusters.
|
||||||
|
|
||||||
And equally important, as a harness for integration tests. (As of now brozzler
|
And equally important, as a harness for integration tests.
|
||||||
itself has no automated tests!)
|
|
||||||
|
|
||||||
You'll need vagrant installed.
|
You'll need vagrant installed.
|
||||||
https://www.vagrantup.com/docs/installation/
|
https://www.vagrantup.com/docs/installation/
|
||||||
|
@ -25,27 +24,27 @@ the brozzler virtualenv.
|
||||||
::
|
::
|
||||||
|
|
||||||
my-laptop$ vagrant ssh
|
my-laptop$ vagrant ssh
|
||||||
vagrant@brozzler-easy:~$ source ~/brozzler-ve34/bin/activate
|
vagrant@brzl:~$ source /opt/brozzler-ve34/bin/activate
|
||||||
(brozzler-ve34)vagrant@brozzler-easy:~$
|
(brozzler-ve34)vagrant@brzl:~$
|
||||||
|
|
||||||
Then you can run brozzler-new-site:
|
Then you can run brozzler-new-site:
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
(brozzler-ve34)vagrant@brozzler-easy:~$ brozzler-new-site \
|
(brozzler-ve34)vagrant@brzl:~$ brozzler-new-site --proxy=localhost:8000 http://example.com/
|
||||||
--proxy=localhost:8000 http://example.com/
|
|
||||||
|
|
||||||
|
|
||||||
Or brozzler-new-job (make sure to set the proxy to localhost:8000):
|
Or brozzler-new-job (make sure to set the proxy to localhost:8000):
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
(brozzler-ve34)vagrant@brozzler-easy:~$ cat >job1.yml
|
(brozzler-ve34)vagrant@brzl:~$ cat >job1.yml <<EOF
|
||||||
id: job1
|
id: job1
|
||||||
proxy: localhost:8000 # point at warcprox for archiving
|
proxy: localhost:8000 # point at warcprox for archiving
|
||||||
seeds:
|
seeds:
|
||||||
- url: https://example.org/
|
- url: https://example.org/
|
||||||
(brozzler-ve34)vagrant@brozzler-easy:~$ brozzler-new-job job1.yml
|
EOF
|
||||||
|
(brozzler-ve34)vagrant@brzl:~$ brozzler-new-job job1.yml
|
||||||
|
|
||||||
WARC files will appear in ./warcs and brozzler, warcprox and rethinkdb logs in
|
WARC files will appear in ./warcs and brozzler, warcprox and rethinkdb logs in
|
||||||
./logs (via vagrant folders syncing).
|
./logs (via vagrant folders syncing).
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue