From 7915220ab744adbfc1853769c415b5e1ce9bcec8 Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Wed, 4 Dec 2019 12:38:22 -0800
Subject: [PATCH] consider page completed after 3 failures

https://github.com/internetarchive/brozzler/pull/183#issuecomment-560562807

"We've had a number of cases where a page kept failing for one reason or
another, and it's bad. We can end up with tons of duplicate captures,
the crawl is not able to make progress, and the overall performance of
the cluster is impacted in cases like yours, where a browser is sitting
there doing nothing for five minutes."
---
 brozzler/__init__.py |  3 +++
 brozzler/worker.py   | 13 ++++++++++-
 tests/test_units.py  | 53 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 68 insertions(+), 1 deletion(-)

diff --git a/brozzler/__init__.py b/brozzler/__init__.py
index c97835f..3be20f2 100644
--- a/brozzler/__init__.py
+++ b/brozzler/__init__.py
@@ -286,6 +286,9 @@ import datetime
 EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(
         tzinfo=doublethink.UTC)
 
+# we could make this configurable if there's a good reason
+MAX_PAGE_FAILURES = 3
+
 from brozzler.worker import BrozzlerWorker
 from brozzler.robots import is_permitted_by_robots
 from brozzler.frontier import RethinkDbFrontier
diff --git a/brozzler/worker.py b/brozzler/worker.py
index 4ef3121..e10205b 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -405,7 +405,18 @@ class BrozzlerWorker:
                 logging.error(
                         'proxy error (self._proxy=%r)', self._proxy, exc_info=1)
         except:
-            self.logger.critical("unexpected exception", exc_info=True)
+            self.logger.error(
+                    'unexpected exception site=%r page=%r', site, page,
+                    exc_info=True)
+            if page:
+                page.failed_attempts = (page.failed_attempts or 0) + 1
+                if page.failed_attempts >= brozzler.MAX_PAGE_FAILURES:
+                    self.logger.info(
+                            'marking page "completed" after %s unexpected '
+                            'exceptions attempting to brozzle %s',
+                            page.failed_attempts, page)
+                    self._frontier.completed_page(site, page)
+                    page = None
         finally:
             if start:
                 site.active_brozzling_time = (site.active_brozzling_time or 0) + time.time() - start
diff --git a/tests/test_units.py b/tests/test_units.py
index 88e4450..5b0295c 100644
--- a/tests/test_units.py
+++ b/tests/test_units.py
@@ -34,6 +34,7 @@ import socket
 import time
 import sys
 import threading
+from unittest import mock
 
 logging.basicConfig(
         stream=sys.stderr, level=logging.INFO, format=(
@@ -439,3 +440,55 @@ def test_seed_redirect():
         {'ssurt': 'com,foo,//http:/',},
         {'ssurt': 'com,bar,//https:/a/b/c',}]}
 
+def test_limit_failures():
+    page = mock.Mock()
+    page.failed_attempts = None
+    page.brozzle_count = 0
+
+    site = mock.Mock()
+    site.status = 'ACTIVE'
+    site.active_brozzling_time = 0
+    site.starts_and_stops = [{'start':datetime.datetime.utcnow()}]
+
+    rr = mock.Mock()
+    rr.servers = [mock.Mock()]
+    rethink_query = mock.Mock(run=mock.Mock(return_value=[]))
+    rr.db_list = mock.Mock(return_value=rethink_query)
+    rr.table_list = mock.Mock(return_value=rethink_query)
+    rr.table = mock.Mock(
+            return_value=mock.Mock(
+                between=mock.Mock(
+                    return_value=mock.Mock(
+                        limit=mock.Mock(
+                            return_value=rethink_query)))))
+    assert rr.table().between().limit().run() == []
+    frontier = brozzler.RethinkDbFrontier(rr)
+    frontier.enforce_time_limit = mock.Mock()
+    frontier.honor_stop_request = mock.Mock()
+    frontier.claim_page = mock.Mock(return_value=page)
+    frontier._maybe_finish_job = mock.Mock()
+
+    browser = mock.Mock()
+
+    worker = brozzler.BrozzlerWorker(frontier)
+    worker.brozzle_page = mock.Mock(side_effect=Exception)
+
+    assert page.failed_attempts is None
+    assert page.brozzle_count == 0
+    assert site.status == 'ACTIVE'
+
+    worker.brozzle_site(browser, site)
+    assert page.failed_attempts == 1
+    assert page.brozzle_count == 0
+    assert site.status == 'ACTIVE'
+
+    worker.brozzle_site(browser, site)
+    assert page.failed_attempts == 2
+    assert page.brozzle_count == 0
+    assert site.status == 'ACTIVE'
+
+    worker.brozzle_site(browser, site)
+    assert page.failed_attempts == 3
+    assert page.brozzle_count == 1
+    assert site.status == 'FINISHED'
+