mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-10-02 16:38:39 -04:00
consider page completed after 3 failures
https://github.com/internetarchive/brozzler/pull/183#issuecomment-560562807 "We've had a number of cases where a page kept failing for one reason or another, and it's bad. We can end up with tons of duplicate captures, the crawl is not able to make progress, and the overall performance of the cluster is impacted in cases like yours, where a browser is sitting there doing nothing for five minutes."
This commit is contained in:
parent
060adaffd0
commit
7915220ab7
3 changed files with 68 additions and 1 deletion
|
@ -34,6 +34,7 @@ import socket
|
|||
import time
|
||||
import sys
|
||||
import threading
|
||||
from unittest import mock
|
||||
|
||||
logging.basicConfig(
|
||||
stream=sys.stderr, level=logging.INFO, format=(
|
||||
|
@ -439,3 +440,55 @@ def test_seed_redirect():
|
|||
{'ssurt': 'com,foo,//http:/',},
|
||||
{'ssurt': 'com,bar,//https:/a/b/c',}]}
|
||||
|
||||
def test_limit_failures():
    """
    Check that a page whose brozzling keeps raising is given up on after
    three failed attempts.

    `worker.brozzle_page` is mocked to raise on every call and
    `frontier.claim_page` always hands back the same mock page. The first
    two `brozzle_site` calls are expected to only bump
    `page.failed_attempts`; the third is expected to also set
    `page.brozzle_count` to 1 and leave the site 'FINISHED'.
    """
    # page that has never been attempted
    page = mock.Mock(failed_attempts=None, brozzle_count=0)

    # active site with one open start/stop entry
    site = mock.Mock(
            status='ACTIVE', active_brozzling_time=0,
            starts_and_stops=[{'start':datetime.datetime.utcnow()}])

    # fake rethinkdb handle: every query that gets run yields an empty list
    empty_query = mock.Mock(run=mock.Mock(return_value=[]))
    rr = mock.Mock(servers=[mock.Mock()])
    rr.db_list = mock.Mock(return_value=empty_query)
    rr.table_list = mock.Mock(return_value=empty_query)
    limited = mock.Mock(limit=mock.Mock(return_value=empty_query))
    table = mock.Mock(between=mock.Mock(return_value=limited))
    rr.table = mock.Mock(return_value=table)
    # sanity check of the mock chain itself
    assert rr.table().between().limit().run() == []

    # frontier with everything but the failure bookkeeping stubbed out
    frontier = brozzler.RethinkDbFrontier(rr)
    frontier.enforce_time_limit = mock.Mock()
    frontier.honor_stop_request = mock.Mock()
    frontier.claim_page = mock.Mock(return_value=page)
    frontier._maybe_finish_job = mock.Mock()

    browser = mock.Mock()

    # worker whose page brozzling always blows up
    worker = brozzler.BrozzlerWorker(frontier)
    worker.brozzle_page = mock.Mock(side_effect=Exception)

    # nothing has happened yet
    assert page.failed_attempts is None
    assert page.brozzle_count == 0
    assert site.status == 'ACTIVE'

    # (failed_attempts, brozzle_count, site status) expected after each
    # successive failing brozzle_site() call
    expected_after_each_call = [
        (1, 0, 'ACTIVE'),
        (2, 0, 'ACTIVE'),
        (3, 1, 'FINISHED'),
    ]
    for failed_attempts, brozzle_count, status in expected_after_each_call:
        worker.brozzle_site(browser, site)
        assert page.failed_attempts == failed_attempts
        assert page.brozzle_count == brozzle_count
        assert site.status == status
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue