consider page completed after 3 failures

https://github.com/internetarchive/brozzler/pull/183#issuecomment-560562807

"We've had a number of cases where a page kept failing for one reason or
another, and it's bad. We can end up with tons of duplicate captures,
the crawl is not able to make progress, and the overall performance of
the cluster is impacted in cases like yours, where a browser is sitting
there doing nothing for five minutes."
This commit is contained in:
Noah Levitt 2019-12-04 12:38:22 -08:00
parent 060adaffd0
commit 7915220ab7
3 changed files with 68 additions and 1 deletions

View file

@@ -34,6 +34,7 @@ import socket
import time
import sys
import threading
from unittest import mock
logging.basicConfig(
stream=sys.stderr, level=logging.INFO, format=(
@@ -439,3 +440,55 @@ def test_seed_redirect():
{'ssurt': 'com,foo,//http:/',},
{'ssurt': 'com,bar,//https:/a/b/c',}]}
def test_limit_failures():
    """Check that a page is treated as done after three failed attempts.

    ``worker.brozzle_page`` is replaced with a mock that always raises, and
    ``frontier.claim_page`` always hands back the same mock page, so every
    call to ``worker.brozzle_site`` is one more failed attempt on that page.
    The test asserts that ``page.failed_attempts`` goes 1, 2, 3 and that only
    on the third failure ``page.brozzle_count`` becomes 1 and ``site.status``
    becomes ``'FINISHED'``.
    """
    # The page starts with no recorded failures and no completed brozzles.
    page = mock.Mock()
    page.failed_attempts = None
    page.brozzle_count = 0

    # Stand-in for an active site that has just been started.
    site = mock.Mock()
    site.status = 'ACTIVE'
    site.active_brozzling_time = 0
    site.starts_and_stops = [{'start':datetime.datetime.utcnow()}]

    # Fake rethinkdb handle: every query built below ends in a ``run()`` that
    # returns an empty list.
    # NOTE(review): presumably this lets RethinkDbFrontier be constructed and
    # queried without a real database -- confirm against the frontier code.
    rr = mock.Mock()
    rr.servers = [mock.Mock()]
    rethink_query = mock.Mock(run=mock.Mock(return_value=[]))
    rr.db_list = mock.Mock(return_value=rethink_query)
    rr.table_list = mock.Mock(return_value=rethink_query)
    rr.table = mock.Mock(
            return_value=mock.Mock(
                between=mock.Mock(
                    return_value=mock.Mock(
                        limit=mock.Mock(
                            return_value=rethink_query)))))
    # Sanity check that the mock chain above is wired up as intended.
    assert rr.table().between().limit().run() == []

    # Frontier with the methods named below mocked out; ``claim_page`` always
    # returns the same page object so its counters accumulate across calls.
    frontier = brozzler.RethinkDbFrontier(rr)
    frontier.enforce_time_limit = mock.Mock()
    frontier.honor_stop_request = mock.Mock()
    frontier.claim_page = mock.Mock(return_value=page)
    frontier._maybe_finish_job = mock.Mock()

    # Worker whose ``brozzle_page`` raises on every call.
    browser = mock.Mock()
    worker = brozzler.BrozzlerWorker(frontier)
    worker.brozzle_page = mock.Mock(side_effect=Exception)

    # Preconditions.
    assert page.failed_attempts is None
    assert page.brozzle_count == 0
    assert site.status == 'ACTIVE'

    # First failure: counted, page not completed, site still active.
    worker.brozzle_site(browser, site)
    assert page.failed_attempts == 1
    assert page.brozzle_count == 0
    assert site.status == 'ACTIVE'

    # Second failure: same expectations, counter at 2.
    worker.brozzle_site(browser, site)
    assert page.failed_attempts == 2
    assert page.brozzle_count == 0
    assert site.status == 'ACTIVE'

    # Third failure: the page is now counted as brozzled and the site ends up
    # finished.
    worker.brozzle_site(browser, site)
    assert page.failed_attempts == 3
    assert page.brozzle_count == 1
    assert site.status == 'FINISHED'