Merge branch 'ARI-6041' into qa

This commit is contained in:
Barbara Miller 2020-03-03 11:48:06 -08:00
commit 12c39050c9
5 changed files with 78 additions and 2 deletions

View File

@@ -286,6 +286,9 @@ import datetime
# Timestamp 0 (the Unix epoch) as a datetime, with doublethink.UTC attached
# as its tzinfo so it can be compared with other timezone-aware values.
EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(
tzinfo=doublethink.UTC)
# Number of unexpected exceptions tolerated while brozzling a single page.
# The worker counts them in page.failed_attempts and, once that count reaches
# this value, marks the page "completed" instead of attempting it again.
# we could make this configurable if there's a good reason
MAX_PAGE_FAILURES = 3
from brozzler.worker import BrozzlerWorker
from brozzler.robots import is_permitted_by_robots
from brozzler.frontier import RethinkDbFrontier

View File

@@ -71,6 +71,15 @@
url_regex: '^https?://(?:www\.)?psu24.psu.edu/.*$'
behavior_js_template: psu24.js
request_idle_timeout_sec: 10
-
url_regex: '^https?://(?:.*\.)?icaew\.com/.*$'
behavior_js_template: umbraBehavior.js.j2
default_parameters:
interval: 500
actions:
- selector: a#CybotCookiebotDialogBodyButtonAccept
- selector: .more-link a.cta-link
repeatSameElement: true
-
url_regex: '^https?://(?:www\.)?pm\.gc\.ca/.*$'
behavior_js_template: umbraBehavior.js.j2

View File

@@ -405,7 +405,18 @@ class BrozzlerWorker:
logging.error(
'proxy error (self._proxy=%r)', self._proxy, exc_info=1)
except:
self.logger.critical("unexpected exception", exc_info=True)
self.logger.error(
'unexpected exception site=%r page=%r', site, page,
exc_info=True)
if page:
page.failed_attempts = (page.failed_attempts or 0) + 1
if page.failed_attempts >= brozzler.MAX_PAGE_FAILURES:
self.logger.info(
'marking page "completed" after %s unexpected '
'exceptions attempting to brozzle %s',
page.failed_attempts, page)
self._frontier.completed_page(site, page)
page = None
finally:
if start:
site.active_brozzling_time = (site.active_brozzling_time or 0) + time.time() - start

View File

@@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup(
name='brozzler',
version='1.5.17',
version='1.5.18',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',

View File

@@ -34,6 +34,7 @@ import socket
import time
import sys
import threading
from unittest import mock
logging.basicConfig(
stream=sys.stderr, level=logging.INFO, format=(
@@ -439,3 +440,55 @@ def test_seed_redirect():
{'ssurt': 'com,foo,//http:/',},
{'ssurt': 'com,bar,//https:/a/b/c',}]}
def test_limit_failures():
    """Check that a page which keeps failing is eventually given up on.

    ``brozzle_page`` is mocked to raise on every call.  The first two calls
    to ``brozzle_site`` must only bump ``page.failed_attempts``; on the third
    call the page is expected to be counted as brozzled and the site to end
    up ``FINISHED``.
    """
    # Page and site are plain mocks carrying only the attributes the
    # assertions below inspect.
    page = mock.Mock(failed_attempts=None, brozzle_count=0)
    site = mock.Mock(
        status='ACTIVE',
        active_brozzling_time=0,
        starts_and_stops=[{'start': datetime.datetime.utcnow()}],
    )

    # Fake rethinkdb handle: every query that gets run yields an empty list.
    empty_query = mock.Mock(run=mock.Mock(return_value=[]))
    limited = mock.Mock(limit=mock.Mock(return_value=empty_query))
    table = mock.Mock(between=mock.Mock(return_value=limited))
    rr = mock.Mock()
    rr.servers = [mock.Mock()]
    rr.db_list = mock.Mock(return_value=empty_query)
    rr.table_list = mock.Mock(return_value=empty_query)
    rr.table = mock.Mock(return_value=table)
    # Sanity check on the mock chain itself.
    assert rr.table().between().limit().run() == []

    # Real frontier on top of the fake db, with the pieces that are not under
    # test stubbed out; claim_page always hands back the same page.
    frontier = brozzler.RethinkDbFrontier(rr)
    frontier.enforce_time_limit = mock.Mock()
    frontier.honor_stop_request = mock.Mock()
    frontier.claim_page = mock.Mock(return_value=page)
    frontier._maybe_finish_job = mock.Mock()

    worker = brozzler.BrozzlerWorker(frontier)
    worker.brozzle_page = mock.Mock(side_effect=Exception)
    browser = mock.Mock()

    # Nothing has been attempted yet.
    assert page.failed_attempts is None
    assert page.brozzle_count == 0
    assert site.status == 'ACTIVE'

    # The first two failures are only counted.
    for expected_failures in (1, 2):
        worker.brozzle_site(browser, site)
        assert page.failed_attempts == expected_failures
        assert page.brozzle_count == 0
        assert site.status == 'ACTIVE'

    # The third failure gets the page marked completed.
    worker.brozzle_site(browser, site)
    assert page.failed_attempts == 3
    assert page.brozzle_count == 1
    assert site.status == 'FINISHED'