diff --git a/.travis.yml b/.travis.yml index 5cb807b..59bd04d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,7 +16,7 @@ install: - chromium-browser --version - sudo service brozzler-worker restart script: -- DISPLAY=:1 py.test -v tests +- DISPLAY=:1 py.test --tb=native -v tests after_failure: - chromium-browser --version - sudo cat /var/log/upstart/warcprox.log diff --git a/Brozzler-Dashboard.png b/Brozzler-Dashboard.png new file mode 100644 index 0000000..a7d5d99 Binary files /dev/null and b/Brozzler-Dashboard.png differ diff --git a/Brozzler-Wayback.png b/Brozzler-Wayback.png new file mode 100644 index 0000000..e9e1375 Binary files /dev/null and b/Brozzler-Wayback.png differ diff --git a/README.rst b/README.rst index 2f1e7b6..bf194bb 100644 --- a/README.rst +++ b/README.rst @@ -1,4 +1,4 @@ -.. image:: https://travis-ci.org/internetarchive/brozzler.svg?branch=master +.. image:: https://travis-ci.org/internetarchive/brozzler.svg?branch=1.4 :target: https://travis-ci.org/internetarchive/brozzler .. |logo| image:: https://cdn.rawgit.com/internetarchive/brozzler/1.1b12/brozzler/dashboard/static/brozzler.svg @@ -73,7 +73,7 @@ To install brozzler only:: pip install brozzler # in a virtualenv if desired -Launch one or more workers:: +Launch one or more workers: [*]_ :: brozzler-worker --warcprox-auto @@ -85,6 +85,14 @@ Submit sites not tied to a job:: brozzler-new-site --time-limit=600 http://example.com/ +.. [*] A note about ``--warcprox-auto``: this option tells brozzler to + look for a healthy warcprox instance in the `rethinkdb service registry + `_. For + this to work you need to have at least one instance of warcprox running, + with the ``--rethinkdb-services-url`` option pointing to the same rethinkdb + services table that brozzler is using. Using ``--warcprox-auto`` is + recommended for clustered deployments. + Job Configuration ----------------- @@ -129,6 +137,8 @@ To start the app, run At this point Brozzler Dashboard will be accessible at http://localhost:8000/. +.. image:: Brozzler-Dashboard.png + See ``brozzler-dashboard --help`` for configuration options. Brozzler Wayback @@ -170,6 +180,8 @@ Run pywb like so: Then browse http://localhost:8880/brozzler/. +.. image:: Brozzler-Wayback.png + Headless Chrome (experimental) ------------------------------ diff --git a/brozzler/__init__.py b/brozzler/__init__.py index 4da5eda..f93685d 100644 --- a/brozzler/__init__.py +++ b/brozzler/__init__.py @@ -17,6 +17,7 @@ See the License for the specific language governing permissions and limitations under the License. """ +import logging from pkg_resources import get_distribution as _get_distribution __version__ = _get_distribution('brozzler').version @@ -57,18 +58,22 @@ class ReachedLimit(Exception): def __str__(self): return self.__repr__() -# monkey-patch log level TRACE -TRACE = 5 -import logging -def _logging_trace(msg, *args, **kwargs): - logging.root.trace(msg, *args, **kwargs) +# monkey-patch log levels TRACE and NOTICE +logging.TRACE = (logging.NOTSET + logging.DEBUG) // 2 def _logger_trace(self, msg, *args, **kwargs): - if self.isEnabledFor(TRACE): - self._log(TRACE, msg, args, **kwargs) -logging.trace = _logging_trace + if self.isEnabledFor(logging.TRACE): + self._log(logging.TRACE, msg, args, **kwargs) logging.Logger.trace = _logger_trace -logging._levelToName[TRACE] = 'TRACE' -logging._nameToLevel['TRACE'] = TRACE +logging.trace = logging.root.trace +logging.addLevelName(logging.TRACE, 'TRACE') + +logging.NOTICE = (logging.INFO + logging.WARN) // 2 +def _logger_notice(self, msg, *args, **kwargs): + if self.isEnabledFor(logging.NOTICE): + self._log(logging.NOTICE, msg, args, **kwargs) +logging.Logger.notice = _logger_notice +logging.notice = logging.root.notice +logging.addLevelName(logging.NOTICE, 'NOTICE') # see https://github.com/internetarchive/brozzler/issues/91 def _logging_handler_handle(self, record): diff --git a/brozzler/browser.py b/brozzler/browser.py index fa633a4..4ee6d82 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -317,7 +317,7 @@ class Browser: kwargs['id'] = msg_id msg = json.dumps(kwargs, separators=',:') logging.log( - brozzler.TRACE if suppress_logging else logging.DEBUG, + logging.TRACE if suppress_logging else logging.DEBUG, 'sending message to %s: %s', self.websock, msg) self.websock.send(msg) return msg_id diff --git a/brozzler/cli.py b/brozzler/cli.py index b1b1e35..15fd905 100644 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -43,15 +43,14 @@ def add_common_options(arg_parser, argv=None): argv = argv or sys.argv arg_parser.add_argument( '-q', '--quiet', dest='log_level', action='store_const', - default=logging.INFO, const=logging.WARN, help=( - 'quiet logging, only warnings and errors')) + default=logging.INFO, const=logging.NOTICE, help='quiet logging') arg_parser.add_argument( '-v', '--verbose', dest='log_level', action='store_const', default=logging.INFO, const=logging.DEBUG, help=( 'verbose logging')) arg_parser.add_argument( '--trace', dest='log_level', action='store_const', - default=logging.INFO, const=brozzler.TRACE, help=( + default=logging.INFO, const=logging.TRACE, help=( 'very verbose logging')) # arg_parser.add_argument( # '-s', '--silent', dest='log_level', action='store_const', diff --git a/brozzler/frontier.py b/brozzler/frontier.py index 2e076d3..5272a88 100644 --- a/brozzler/frontier.py +++ b/brozzler/frontier.py @@ -94,6 +94,7 @@ class RethinkDbFrontier: k, expected, result)) def claim_sites(self, n=1): + self.logger.trace('claiming up to %s sites to brozzle', n) result = ( self.rr.table('sites').get_all(r.args( r.db(self.rr.dbname).table('sites', read_mode='majority') @@ -145,6 +146,7 @@ class RethinkDbFrontier: result["changes"][i]["old_val"]["last_claimed"]) site = brozzler.Site(self.rr, result["changes"][i]["new_val"]) sites.append(site) + self.logger.debug('claimed %s sites', len(sites)) if sites: return sites else: diff --git a/brozzler/worker.py b/brozzler/worker.py index f87388d..89437b1 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -446,6 +446,7 @@ class BrozzlerWorker: Raises: NoBrowsersAvailable if none available ''' + # acquire_multi() raises NoBrowsersAvailable if none available browsers = self._browser_pool.acquire_multi( (self._browser_pool.num_available() + 1) // 2) try: @@ -468,22 +469,26 @@ class BrozzlerWorker: self._browser_pool.release(browsers[i]) def run(self): - self.logger.info("brozzler worker starting") + self.logger.notice("brozzler worker starting") + last_nothing_to_claim = 0 try: while not self._shutdown.is_set(): self._service_heartbeat_if_due() - try: - self._start_browsing_some_sites() - except brozzler.browser.NoBrowsersAvailable: - logging.trace( - "all %s browsers are in use", self._max_browsers) - except brozzler.NothingToClaim: - logging.trace( - "all active sites are already claimed by a " - "brozzler worker") + if time.time() - last_nothing_to_claim > 20: + try: + self._start_browsing_some_sites() + except brozzler.browser.NoBrowsersAvailable: + logging.trace( + "all %s browsers are in use", + self._max_browsers) + except brozzler.NothingToClaim: + last_nothing_to_claim = time.time() + logging.trace( + "nothing to claim, all available active sites " + "are already claimed by a brozzler worker") time.sleep(0.5) - self.logger.info("shutdown requested") + self.logger.notice("shutdown requested") except r.ReqlError as e: self.logger.error( "caught rethinkdb exception, will try to proceed", diff --git a/setup.py b/setup.py index fee81ff..a880078 100755 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.4.dev297', + version='1.5.dev302', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', @@ -63,32 +63,35 @@ setuptools.setup( ], }, install_requires=[ - 'PyYAML', - 'youtube-dl', + 'PyYAML>=3.12', + 'youtube-dl>=2018.7.21', 'reppy==0.3.4', - 'requests', - 'websocket-client!=0.39.0,!=0.49.0', + 'python-magic', + 'requests>=2.18.4', + 'websocket-client>=0.39.0,<=0.48.0', 'pillow>=5.2.0', 'urlcanon>=0.1.dev23', 'doublethink>=0.2.0.dev88', - 'rethinkdb>=2.3,<2.4', - 'cerberus==1.0.1', - 'jinja2', - 'cryptography!=2.1.1', # 2.1.1 installation is failing on ubuntu - 'python-magic', + 'rethinkdb>=2.3', + 'cerberus>=1.0.1', + 'jinja2>=2.10', + 'cryptography>=2.3', ], extras_require={ - 'dashboard': ['flask>=0.11', 'gunicorn'], + 'dashboard': [ + 'flask>=0.11', + 'gunicorn>=19.8.1' + ], 'easy': [ 'warcprox>=2.4b2.dev173', - 'pywb<2', + 'pywb>=0.33.2,<2', 'flask>=0.11', - 'gunicorn' + 'gunicorn>=19.8.1' ], }, zip_safe=False, classifiers=[ - 'Development Status :: 4 - Beta', + 'Development Status :: 5 - Production/Stable', 'Environment :: Console', 'License :: OSI Approved :: Apache Software License', 'Programming Language :: Python :: 3.4', diff --git a/tests/test_cli.py b/tests/test_cli.py index 0235416..03ec39a 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -53,9 +53,9 @@ def test_run_command(capsys, cmd): proc = subprocess.Popen( [cmd, '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE) out, err = proc.communicate() + assert err == b'' assert out == ('brozzler %s - %s\n' % ( brozzler.__version__, cmd)).encode('ascii') - assert err == b'' def test_rethinkdb_up(): '''Check that rethinkdb is up and running.''' diff --git a/vagrant/README.rst b/vagrant/README.rst index d9e1545..fdb96bc 100644 --- a/vagrant/README.rst +++ b/vagrant/README.rst @@ -1,15 +1,14 @@ Single-VM Vagrant Brozzler Deployment ------------------------------------- -This is a work in progress. Vagrant + ansible configuration for a single-vm -deployment of brozzler and warcprox with dependencies (notably rethinkdb). +This is a vagrant + ansible configuration for a single-vm deployment of +brozzler and warcprox with dependencies (notably rethinkdb). The idea is for this to be a quick way for people to get up and running with a deployment resembling a real distributed deployment, and to offer a starting configuration for people to adapt to their clusters. -And equally important, as a harness for integration tests. (As of now brozzler -itself has no automated tests!) +And equally important, as a harness for integration tests. You'll need vagrant installed. https://www.vagrantup.com/docs/installation/ @@ -25,27 +24,27 @@ the brozzler virtualenv. :: my-laptop$ vagrant ssh - vagrant@brozzler-easy:~$ source ~/brozzler-ve34/bin/activate - (brozzler-ve34)vagrant@brozzler-easy:~$ + vagrant@brzl:~$ source /opt/brozzler-ve34/bin/activate + (brozzler-ve34)vagrant@brzl:~$ Then you can run brozzler-new-site: :: - (brozzler-ve34)vagrant@brozzler-easy:~$ brozzler-new-site \ - --proxy=localhost:8000 http://example.com/ + (brozzler-ve34)vagrant@brzl:~$ brozzler-new-site --proxy=localhost:8000 http://example.com/ Or brozzler-new-job (make sure to set the proxy to localhost:8000): :: - (brozzler-ve34)vagrant@brozzler-easy:~$ cat >job1.yml + (brozzler-ve34)vagrant@brzl:~$ cat >job1.yml <