Merge branch 'master' into qa

* master:
  oops, back to dev version number
  wait 20 seconds to claim sites if none were available
  tweak logging
  why did those tests fail??? (#117)
  Add screenshots
  Add screenshots
  back to dev version
  1.4 for pypi
  explain --warcprox-auto briefly
  vagrant readme fixes (thanks funkyfuture)
  update cryptography dep version
This commit is contained in:
Noah Levitt 2018-09-04 10:54:26 -07:00
commit c4fdbe578d
12 changed files with 79 additions and 54 deletions

View File

@ -16,7 +16,7 @@ install:
- chromium-browser --version
- sudo service brozzler-worker restart
script:
- DISPLAY=:1 py.test -v tests
- DISPLAY=:1 py.test --tb=native -v tests
after_failure:
- chromium-browser --version
- sudo cat /var/log/upstart/warcprox.log

BIN
Brozzler-Dashboard.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 136 KiB

BIN
Brozzler-Wayback.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.1 MiB

View File

@ -1,4 +1,4 @@
.. image:: https://travis-ci.org/internetarchive/brozzler.svg?branch=master
.. image:: https://travis-ci.org/internetarchive/brozzler.svg?branch=1.4
:target: https://travis-ci.org/internetarchive/brozzler
.. |logo| image:: https://cdn.rawgit.com/internetarchive/brozzler/1.1b12/brozzler/dashboard/static/brozzler.svg
@ -73,7 +73,7 @@ To install brozzler only::
pip install brozzler # in a virtualenv if desired
Launch one or more workers::
Launch one or more workers: [*]_ ::
brozzler-worker --warcprox-auto
@ -85,6 +85,14 @@ Submit sites not tied to a job::
brozzler-new-site --time-limit=600 http://example.com/
.. [*] A note about ``--warcprox-auto``: this option tells brozzler to
look for a healthy warcprox instance in the `rethinkdb service registry
<https://github.com/internetarchive/doublethink#service-registry>`_. For
this to work you need to have at least one instance of warcprox running,
with the ``--rethinkdb-services-url`` option pointing to the same rethinkdb
services table that brozzler is using. Using ``--warcprox-auto`` is
recommended for clustered deployments.
Job Configuration
-----------------
@ -129,6 +137,8 @@ To start the app, run
At this point Brozzler Dashboard will be accessible at http://localhost:8000/.
.. image:: Brozzler-Dashboard.png
See ``brozzler-dashboard --help`` for configuration options.
Brozzler Wayback
@ -170,6 +180,8 @@ Run pywb like so:
Then browse http://localhost:8880/brozzler/.
.. image:: Brozzler-Wayback.png
Headless Chrome (experimental)
------------------------------

View File

@ -17,6 +17,7 @@ See the License for the specific language governing permissions and
limitations under the License.
"""
import logging
from pkg_resources import get_distribution as _get_distribution
__version__ = _get_distribution('brozzler').version
@ -57,18 +58,22 @@ class ReachedLimit(Exception):
def __str__(self):
return self.__repr__()
# monkey-patch log level TRACE
TRACE = 5
import logging
def _logging_trace(msg, *args, **kwargs):
logging.root.trace(msg, *args, **kwargs)
# monkey-patch log levels TRACE and NOTICE
logging.TRACE = (logging.NOTSET + logging.DEBUG) // 2
def _logger_trace(self, msg, *args, **kwargs):
if self.isEnabledFor(TRACE):
self._log(TRACE, msg, args, **kwargs)
logging.trace = _logging_trace
if self.isEnabledFor(logging.TRACE):
self._log(logging.TRACE, msg, args, **kwargs)
logging.Logger.trace = _logger_trace
logging._levelToName[TRACE] = 'TRACE'
logging._nameToLevel['TRACE'] = TRACE
logging.trace = logging.root.trace
logging.addLevelName(logging.TRACE, 'TRACE')
logging.NOTICE = (logging.INFO + logging.WARN) // 2
def _logger_notice(self, msg, *args, **kwargs):
if self.isEnabledFor(logging.NOTICE):
self._log(logging.NOTICE, msg, args, **kwargs)
logging.Logger.notice = _logger_notice
logging.notice = logging.root.notice
logging.addLevelName(logging.NOTICE, 'NOTICE')
# see https://github.com/internetarchive/brozzler/issues/91
def _logging_handler_handle(self, record):

View File

@ -317,7 +317,7 @@ class Browser:
kwargs['id'] = msg_id
msg = json.dumps(kwargs, separators=',:')
logging.log(
brozzler.TRACE if suppress_logging else logging.DEBUG,
logging.TRACE if suppress_logging else logging.DEBUG,
'sending message to %s: %s', self.websock, msg)
self.websock.send(msg)
return msg_id

View File

@ -43,15 +43,14 @@ def add_common_options(arg_parser, argv=None):
argv = argv or sys.argv
arg_parser.add_argument(
'-q', '--quiet', dest='log_level', action='store_const',
default=logging.INFO, const=logging.WARN, help=(
'quiet logging, only warnings and errors'))
default=logging.INFO, const=logging.NOTICE, help='quiet logging')
arg_parser.add_argument(
'-v', '--verbose', dest='log_level', action='store_const',
default=logging.INFO, const=logging.DEBUG, help=(
'verbose logging'))
arg_parser.add_argument(
'--trace', dest='log_level', action='store_const',
default=logging.INFO, const=brozzler.TRACE, help=(
default=logging.INFO, const=logging.TRACE, help=(
'very verbose logging'))
# arg_parser.add_argument(
# '-s', '--silent', dest='log_level', action='store_const',

View File

@ -94,6 +94,7 @@ class RethinkDbFrontier:
k, expected, result))
def claim_sites(self, n=1):
self.logger.trace('claiming up to %s sites to brozzle', n)
result = (
self.rr.table('sites').get_all(r.args(
r.db(self.rr.dbname).table('sites', read_mode='majority')
@ -145,6 +146,7 @@ class RethinkDbFrontier:
result["changes"][i]["old_val"]["last_claimed"])
site = brozzler.Site(self.rr, result["changes"][i]["new_val"])
sites.append(site)
self.logger.debug('claimed %s sites', len(sites))
if sites:
return sites
else:

View File

@ -446,6 +446,7 @@ class BrozzlerWorker:
Raises:
NoBrowsersAvailable if none available
'''
# acquire_multi() raises NoBrowsersAvailable if none available
browsers = self._browser_pool.acquire_multi(
(self._browser_pool.num_available() + 1) // 2)
try:
@ -468,22 +469,26 @@ class BrozzlerWorker:
self._browser_pool.release(browsers[i])
def run(self):
self.logger.info("brozzler worker starting")
self.logger.notice("brozzler worker starting")
last_nothing_to_claim = 0
try:
while not self._shutdown.is_set():
self._service_heartbeat_if_due()
try:
self._start_browsing_some_sites()
except brozzler.browser.NoBrowsersAvailable:
logging.trace(
"all %s browsers are in use", self._max_browsers)
except brozzler.NothingToClaim:
logging.trace(
"all active sites are already claimed by a "
"brozzler worker")
if time.time() - last_nothing_to_claim > 20:
try:
self._start_browsing_some_sites()
except brozzler.browser.NoBrowsersAvailable:
logging.trace(
"all %s browsers are in use",
self._max_browsers)
except brozzler.NothingToClaim:
last_nothing_to_claim = time.time()
logging.trace(
"nothing to claim, all available active sites "
"are already claimed by a brozzler worker")
time.sleep(0.5)
self.logger.info("shutdown requested")
self.logger.notice("shutdown requested")
except r.ReqlError as e:
self.logger.error(
"caught rethinkdb exception, will try to proceed",

View File

@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup(
name='brozzler',
version='1.4.dev297',
version='1.5.dev302',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',
@ -63,32 +63,35 @@ setuptools.setup(
],
},
install_requires=[
'PyYAML',
'youtube-dl',
'PyYAML>=3.12',
'youtube-dl>=2018.7.21',
'reppy==0.3.4',
'requests',
'websocket-client!=0.39.0,!=0.49.0',
'python-magic',
'requests>=2.18.4',
'websocket-client>=0.39.0,<=0.48.0',
'pillow>=5.2.0',
'urlcanon>=0.1.dev23',
'doublethink>=0.2.0.dev88',
'rethinkdb>=2.3,<2.4',
'cerberus==1.0.1',
'jinja2',
'cryptography!=2.1.1', # 2.1.1 installation is failing on ubuntu
'python-magic',
'rethinkdb>=2.3',
'cerberus>=1.0.1',
'jinja2>=2.10',
'cryptography>=2.3',
],
extras_require={
'dashboard': ['flask>=0.11', 'gunicorn'],
'dashboard': [
'flask>=0.11',
'gunicorn>=19.8.1'
],
'easy': [
'warcprox>=2.4b2.dev173',
'pywb<2',
'pywb>=0.33.2,<2',
'flask>=0.11',
'gunicorn'
'gunicorn>=19.8.1'
],
},
zip_safe=False,
classifiers=[
'Development Status :: 4 - Beta',
'Development Status :: 5 - Production/Stable',
'Environment :: Console',
'License :: OSI Approved :: Apache Software License',
'Programming Language :: Python :: 3.4',

View File

@ -53,9 +53,9 @@ def test_run_command(capsys, cmd):
proc = subprocess.Popen(
[cmd, '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
out, err = proc.communicate()
assert err == b''
assert out == ('brozzler %s - %s\n' % (
brozzler.__version__, cmd)).encode('ascii')
assert err == b''
def test_rethinkdb_up():
'''Check that rethinkdb is up and running.'''

View File

@ -1,15 +1,14 @@
Single-VM Vagrant Brozzler Deployment
-------------------------------------
This is a work in progress. Vagrant + ansible configuration for a single-vm
deployment of brozzler and warcprox with dependencies (notably rethinkdb).
This is a vagrant + ansible configuration for a single-vm deployment of
brozzler and warcprox with dependencies (notably rethinkdb).
The idea is for this to be a quick way for people to get up and running with a
deployment resembling a real distributed deployment, and to offer a starting
configuration for people to adapt to their clusters.
And equally important, as a harness for integration tests. (As of now brozzler
itself has no automated tests!)
Equally important, it serves as a harness for integration tests.
You'll need vagrant installed.
https://www.vagrantup.com/docs/installation/
@ -25,27 +24,27 @@ the brozzler virtualenv.
::
my-laptop$ vagrant ssh
vagrant@brozzler-easy:~$ source ~/brozzler-ve34/bin/activate
(brozzler-ve34)vagrant@brozzler-easy:~$
vagrant@brzl:~$ source /opt/brozzler-ve34/bin/activate
(brozzler-ve34)vagrant@brzl:~$
Then you can run brozzler-new-site:
::
(brozzler-ve34)vagrant@brozzler-easy:~$ brozzler-new-site \
--proxy=localhost:8000 http://example.com/
(brozzler-ve34)vagrant@brzl:~$ brozzler-new-site --proxy=localhost:8000 http://example.com/
Or brozzler-new-job (make sure to set the proxy to localhost:8000):
::
(brozzler-ve34)vagrant@brozzler-easy:~$ cat >job1.yml
(brozzler-ve34)vagrant@brzl:~$ cat >job1.yml <<EOF
id: job1
proxy: localhost:8000 # point at warcprox for archiving
seeds:
- url: https://example.org/
(brozzler-ve34)vagrant@brozzler-easy:~$ brozzler-new-job job1.yml
- url: https://example.org/
EOF
(brozzler-ve34)vagrant@brzl:~$ brozzler-new-job job1.yml
WARC files will appear in ./warcs, and the brozzler, warcprox and rethinkdb
logs will appear in ./logs (via vagrant synced folders).