mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-23 08:09:48 -05:00
consider page completed after 3 failures
https://github.com/internetarchive/brozzler/pull/183#issuecomment-560562807 "We've had a number of cases where a page kept failing for one reason or another, and it's bad. We can end up with tons of duplicate captures, the crawl is not able to make progress, and the overall performance of the cluster is impacted in cases like yours, where a browser is sitting there doing nothing for five minutes."
This commit is contained in:
parent
060adaffd0
commit
7915220ab7
@ -286,6 +286,9 @@ import datetime
|
||||
EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(
|
||||
tzinfo=doublethink.UTC)
|
||||
|
||||
# we could make this configurable if there's a good reason
|
||||
MAX_PAGE_FAILURES = 3
|
||||
|
||||
from brozzler.worker import BrozzlerWorker
|
||||
from brozzler.robots import is_permitted_by_robots
|
||||
from brozzler.frontier import RethinkDbFrontier
|
||||
|
@ -405,7 +405,18 @@ class BrozzlerWorker:
|
||||
logging.error(
|
||||
'proxy error (self._proxy=%r)', self._proxy, exc_info=1)
|
||||
except:
|
||||
self.logger.critical("unexpected exception", exc_info=True)
|
||||
self.logger.error(
|
||||
'unexpected exception site=%r page=%r', site, page,
|
||||
exc_info=True)
|
||||
if page:
|
||||
page.failed_attempts = (page.failed_attempts or 0) + 1
|
||||
if page.failed_attempts >= brozzler.MAX_PAGE_FAILURES:
|
||||
self.logger.info(
|
||||
'marking page "completed" after %s unexpected '
|
||||
'exceptions attempting to brozzle %s',
|
||||
page.failed_attempts, page)
|
||||
self._frontier.completed_page(site, page)
|
||||
page = None
|
||||
finally:
|
||||
if start:
|
||||
site.active_brozzling_time = (site.active_brozzling_time or 0) + time.time() - start
|
||||
|
@ -34,6 +34,7 @@ import socket
|
||||
import time
|
||||
import sys
|
||||
import threading
|
||||
from unittest import mock
|
||||
|
||||
logging.basicConfig(
|
||||
stream=sys.stderr, level=logging.INFO, format=(
|
||||
@ -439,3 +440,55 @@ def test_seed_redirect():
|
||||
{'ssurt': 'com,foo,//http:/',},
|
||||
{'ssurt': 'com,bar,//https:/a/b/c',}]}
|
||||
|
||||
def test_limit_failures():
    """Check that a page that keeps failing is given up on at the third try.

    The frontier is backed by a mocked rethinkdb connection and always
    hands out the same mock page; ``brozzle_page`` is patched to raise
    ``Exception`` on every call. The first two ``brozzle_site`` calls should
    only bump ``page.failed_attempts``. The third should additionally leave
    the page with ``brozzle_count == 1`` and the site ``FINISHED``
    (presumably via the frontier's ``completed_page`` -- confirm against the
    worker code).
    """
    page = mock.Mock()
    page.failed_attempts = None
    page.brozzle_count = 0

    site = mock.Mock()
    site.status = 'ACTIVE'
    site.active_brozzling_time = 0
    site.starts_and_stops = [{'start':datetime.datetime.utcnow()}]

    # Fake rethinkdb handle: db_list(), table_list() and
    # table().between().limit() all end in a query whose run() returns [].
    empty_query = mock.Mock(run=mock.Mock(return_value=[]))
    limited = mock.Mock(limit=mock.Mock(return_value=empty_query))
    fake_table = mock.Mock(between=mock.Mock(return_value=limited))

    rr = mock.Mock()
    rr.servers = [mock.Mock()]
    rr.db_list = mock.Mock(return_value=empty_query)
    rr.table_list = mock.Mock(return_value=empty_query)
    rr.table = mock.Mock(return_value=fake_table)
    # sanity check that the mocked query chain is wired up as intended
    assert rr.table().between().limit().run() == []

    frontier = brozzler.RethinkDbFrontier(rr)
    # stub out the frontier methods this test does not want to exercise
    frontier.enforce_time_limit = mock.Mock()
    frontier.honor_stop_request = mock.Mock()
    frontier.claim_page = mock.Mock(return_value=page)
    frontier._maybe_finish_job = mock.Mock()

    browser = mock.Mock()

    worker = brozzler.BrozzlerWorker(frontier)
    # every attempt to brozzle the page blows up
    worker.brozzle_page = mock.Mock(side_effect=Exception)

    # starting state
    assert page.failed_attempts is None
    assert page.brozzle_count == 0
    assert site.status == 'ACTIVE'

    # first and second failures: counted, but page and site stay live
    for expected_failures in (1, 2):
        worker.brozzle_site(browser, site)
        assert page.failed_attempts == expected_failures
        assert page.brozzle_count == 0
        assert site.status == 'ACTIVE'

    # third failure: the page is treated as done
    worker.brozzle_site(browser, site)
    assert page.failed_attempts == 3
    assert page.brozzle_count == 1
    assert site.status == 'FINISHED'
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user