consider page completed after 3 failures

https://github.com/internetarchive/brozzler/pull/183#issuecomment-560562807

"We've had a number of cases where a page kept failing for one reason or
another, and it's bad. We can end up with tons of duplicate captures,
the crawl is not able to make progress, and the overall performance of
the cluster is impacted in cases like yours, where a browser is sitting
there doing nothing for five minutes."
This commit is contained in:
Noah Levitt 2019-12-04 12:38:22 -08:00
parent 060adaffd0
commit 7915220ab7
3 changed files with 68 additions and 1 deletions

View File

@ -286,6 +286,9 @@ import datetime
# The Unix epoch (timestamp 0.0) as a datetime whose tzinfo is doublethink.UTC.
EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(
tzinfo=doublethink.UTC)
# Number of failed attempts on a single page (unexpected exceptions raised
# while brozzling it) after which the worker marks the page "completed"
# rather than trying it again.
# We could make this configurable if there's a good reason.
MAX_PAGE_FAILURES = 3
from brozzler.worker import BrozzlerWorker
from brozzler.robots import is_permitted_by_robots
from brozzler.frontier import RethinkDbFrontier

View File

@ -405,7 +405,18 @@ class BrozzlerWorker:
logging.error(
'proxy error (self._proxy=%r)', self._proxy, exc_info=1)
except:
self.logger.critical("unexpected exception", exc_info=True)
self.logger.error(
'unexpected exception site=%r page=%r', site, page,
exc_info=True)
if page:
page.failed_attempts = (page.failed_attempts or 0) + 1
if page.failed_attempts >= brozzler.MAX_PAGE_FAILURES:
self.logger.info(
'marking page "completed" after %s unexpected '
'exceptions attempting to brozzle %s',
page.failed_attempts, page)
self._frontier.completed_page(site, page)
page = None
finally:
if start:
site.active_brozzling_time = (site.active_brozzling_time or 0) + time.time() - start

View File

@ -34,6 +34,7 @@ import socket
import time
import sys
import threading
from unittest import mock
logging.basicConfig(
stream=sys.stderr, level=logging.INFO, format=(
@ -439,3 +440,55 @@ def test_seed_redirect():
{'ssurt': 'com,foo,//http:/',},
{'ssurt': 'com,bar,//https:/a/b/c',}]}
def test_limit_failures():
    """A page that keeps failing is given up on after three attempts.

    ``brozzle_page`` is stubbed to raise on every call and ``claim_page``
    always hands back the same mock page.  The first two failures should
    only bump ``page.failed_attempts``; after the third, the page is
    expected to be counted as brozzled and the site to be FINISHED.
    """
    page = mock.Mock(failed_attempts=None, brozzle_count=0)
    site = mock.Mock(
        status='ACTIVE',
        active_brozzling_time=0,
        starts_and_stops=[{'start':datetime.datetime.utcnow()}])

    # Stand-in for the rethinkdb handle: every query the frontier issues
    # through these entry points resolves to an empty result list.
    rethink_query = mock.Mock(run=mock.Mock(return_value=[]))
    limited = mock.Mock(limit=mock.Mock(return_value=rethink_query))
    ranged = mock.Mock(between=mock.Mock(return_value=limited))
    rr = mock.Mock()
    rr.servers = [mock.Mock()]
    rr.db_list = mock.Mock(return_value=rethink_query)
    rr.table_list = mock.Mock(return_value=rethink_query)
    rr.table = mock.Mock(return_value=ranged)
    # sanity check of the mock chain itself
    assert rr.table().between().limit().run() == []

    frontier = brozzler.RethinkDbFrontier(rr)
    frontier.enforce_time_limit = mock.Mock()
    frontier.honor_stop_request = mock.Mock()
    frontier.claim_page = mock.Mock(return_value=page)
    frontier._maybe_finish_job = mock.Mock()

    browser = mock.Mock()
    worker = brozzler.BrozzlerWorker(frontier)
    # every attempt to brozzle the page blows up
    worker.brozzle_page = mock.Mock(side_effect=Exception)

    # state before any attempt
    assert page.failed_attempts is None
    assert page.brozzle_count == 0
    assert site.status == 'ACTIVE'

    # (failed_attempts, brozzle_count, site.status) expected after each
    # successive call to brozzle_site
    expected_after_attempt = (
        (1, 0, 'ACTIVE'),
        (2, 0, 'ACTIVE'),
        (3, 1, 'FINISHED'),
    )
    for failures, brozzles, status in expected_after_attempt:
        worker.brozzle_site(browser, site)
        assert page.failed_attempts == failures
        assert page.brozzle_count == brozzles
        assert site.status == status