# Reviewer notes. These lines sit ahead of the first `diff --git` header, so
# they belong to no hunk.
#
# Purpose of the patch: cap how many times a single page may fail with an
# unexpected exception before it is marked completed.
#
# brozzler/__init__.py
#   Adds the module-level constant MAX_PAGE_FAILURES = 3.
#
# brozzler/worker.py (hunk inside class BrozzlerWorker)
#   The bare `except:` handler now logs at error level (it was critical) and
#   includes site and page in the message. When `page` is truthy it
#   increments page.failed_attempts, treating a falsy previous value as 0.
#   Once the count reaches brozzler.MAX_PAGE_FAILURES it logs at info level,
#   calls self._frontier.completed_page(site, page) and rebinds page to None.
#   NOTE(review): rebinding `page` to None presumably keeps the code after
#   this handler from treating the page as still in progress; the rest of the
#   method is outside this hunk -- confirm.
#
# tests/test_units.py
#   Imports unittest.mock and adds test_limit_failures. The test builds a
#   RethinkDbFrontier around a mock object `rr`, replaces claim_page with a
#   mock that always returns one mocked page, and replaces
#   worker.brozzle_page with a mock that raises Exception. It then calls
#   worker.brozzle_site three times and asserts that page.failed_attempts
#   goes 1, 2, 3; that page.brozzle_count stays 0 until the third call and is
#   then 1; and that site.status stays 'ACTIVE' until the third call and is
#   then 'FINISHED'.
#
# NOTE(review): several diff records share each physical line below. Unified
# diff is line-oriented, so the line breaks need restoring before this patch
# can be applied -- TODO confirm against the original commit.
diff --git a/brozzler/__init__.py b/brozzler/__init__.py index c97835f..3be20f2 100644 --- a/brozzler/__init__.py +++ b/brozzler/__init__.py @@ -286,6 +286,9 @@ import datetime EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace( tzinfo=doublethink.UTC) +# we could make this configurable if there's a good reason +MAX_PAGE_FAILURES = 3 + from brozzler.worker import BrozzlerWorker from brozzler.robots import is_permitted_by_robots from brozzler.frontier import RethinkDbFrontier diff --git a/brozzler/worker.py b/brozzler/worker.py index 4ef3121..e10205b 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -405,7 +405,18 @@ class BrozzlerWorker: logging.error( 'proxy error (self._proxy=%r)', self._proxy, exc_info=1) except: - self.logger.critical("unexpected exception", exc_info=True) + self.logger.error( + 'unexpected exception site=%r page=%r', site, page, + exc_info=True) + if page: + page.failed_attempts = (page.failed_attempts or 0) + 1 + if page.failed_attempts >= brozzler.MAX_PAGE_FAILURES: + self.logger.info( + 'marking page "completed" after %s unexpected ' + 'exceptions attempting to brozzle %s', + page.failed_attempts, page) + self._frontier.completed_page(site, page) + page = None finally: if start: site.active_brozzling_time = (site.active_brozzling_time or 0) + time.time() - start diff --git a/tests/test_units.py b/tests/test_units.py index 88e4450..5b0295c 100644 --- a/tests/test_units.py +++ b/tests/test_units.py @@ -34,6 +34,7 @@ import socket import time import sys import threading +from unittest import mock logging.basicConfig( stream=sys.stderr, level=logging.INFO, format=( @@ -439,3 +440,55 @@ def test_seed_redirect(): {'ssurt': 'com,foo,//http:/',}, {'ssurt': 'com,bar,//https:/a/b/c',}]} +def test_limit_failures(): + page = mock.Mock() + page.failed_attempts = None + page.brozzle_count = 0 + + site = mock.Mock() + site.status = 'ACTIVE' + site.active_brozzling_time = 0 + site.starts_and_stops = [{'start':datetime.datetime.utcnow()}] 
+ + rr = mock.Mock() + rr.servers = [mock.Mock()] + rethink_query = mock.Mock(run=mock.Mock(return_value=[])) + rr.db_list = mock.Mock(return_value=rethink_query) + rr.table_list = mock.Mock(return_value=rethink_query) + rr.table = mock.Mock( + return_value=mock.Mock( + between=mock.Mock( + return_value=mock.Mock( + limit=mock.Mock( + return_value=rethink_query))))) + assert rr.table().between().limit().run() == [] + frontier = brozzler.RethinkDbFrontier(rr) + frontier.enforce_time_limit = mock.Mock() + frontier.honor_stop_request = mock.Mock() + frontier.claim_page = mock.Mock(return_value=page) + frontier._maybe_finish_job = mock.Mock() + + browser = mock.Mock() + + worker = brozzler.BrozzlerWorker(frontier) + worker.brozzle_page = mock.Mock(side_effect=Exception) + + assert page.failed_attempts is None + assert page.brozzle_count == 0 + assert site.status == 'ACTIVE' + + worker.brozzle_site(browser, site) + assert page.failed_attempts == 1 + assert page.brozzle_count == 0 + assert site.status == 'ACTIVE' + + worker.brozzle_site(browser, site) + assert page.failed_attempts == 2 + assert page.brozzle_count == 0 + assert site.status == 'ACTIVE' + + worker.brozzle_site(browser, site) + assert page.failed_attempts == 3 + assert page.brozzle_count == 1 + assert site.status == 'FINISHED' +