mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-20 23:56:34 -04:00
Merge branch 'ARI-6041' into qa
This commit is contained in:
commit
12c39050c9
@ -286,6 +286,9 @@ import datetime
|
||||
EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(
|
||||
tzinfo=doublethink.UTC)
|
||||
|
||||
# we could make this configurable if there's a good reason
|
||||
MAX_PAGE_FAILURES = 3
|
||||
|
||||
from brozzler.worker import BrozzlerWorker
|
||||
from brozzler.robots import is_permitted_by_robots
|
||||
from brozzler.frontier import RethinkDbFrontier
|
||||
|
@ -71,6 +71,15 @@
|
||||
url_regex: '^https?://(?:www\.)?psu24.psu.edu/.*$'
|
||||
behavior_js_template: psu24.js
|
||||
request_idle_timeout_sec: 10
|
||||
-
|
||||
url_regex: '^https?://(?:.*\.)?icaew\.com/.*$'
|
||||
behavior_js_template: umbraBehavior.js.j2
|
||||
default_parameters:
|
||||
interval: 500
|
||||
actions:
|
||||
- selector: a#CybotCookiebotDialogBodyButtonAccept
|
||||
- selector: .more-link a.cta-link
|
||||
repeatSameElement: true
|
||||
-
|
||||
url_regex: '^https?://(?:www\.)?pm\.gc\.ca/.*$'
|
||||
behavior_js_template: umbraBehavior.js.j2
|
||||
|
@ -405,7 +405,18 @@ class BrozzlerWorker:
|
||||
logging.error(
|
||||
'proxy error (self._proxy=%r)', self._proxy, exc_info=1)
|
||||
except:
|
||||
self.logger.critical("unexpected exception", exc_info=True)
|
||||
self.logger.error(
|
||||
'unexpected exception site=%r page=%r', site, page,
|
||||
exc_info=True)
|
||||
if page:
|
||||
page.failed_attempts = (page.failed_attempts or 0) + 1
|
||||
if page.failed_attempts >= brozzler.MAX_PAGE_FAILURES:
|
||||
self.logger.info(
|
||||
'marking page "completed" after %s unexpected '
|
||||
'exceptions attempting to brozzle %s',
|
||||
page.failed_attempts, page)
|
||||
self._frontier.completed_page(site, page)
|
||||
page = None
|
||||
finally:
|
||||
if start:
|
||||
site.active_brozzling_time = (site.active_brozzling_time or 0) + time.time() - start
|
||||
|
2
setup.py
2
setup.py
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||
|
||||
setuptools.setup(
|
||||
name='brozzler',
|
||||
version='1.5.17',
|
||||
version='1.5.18',
|
||||
description='Distributed web crawling with browsers',
|
||||
url='https://github.com/internetarchive/brozzler',
|
||||
author='Noah Levitt',
|
||||
|
@ -34,6 +34,7 @@ import socket
|
||||
import time
|
||||
import sys
|
||||
import threading
|
||||
from unittest import mock
|
||||
|
||||
logging.basicConfig(
|
||||
stream=sys.stderr, level=logging.INFO, format=(
|
||||
@ -439,3 +440,55 @@ def test_seed_redirect():
|
||||
{'ssurt': 'com,foo,//http:/',},
|
||||
{'ssurt': 'com,bar,//https:/a/b/c',}]}
|
||||
|
||||
def test_limit_failures():
    """Exercise the worker's handling of a page that fails every time.

    All collaborators are mocks: ``claim_page`` always hands back the same
    page and ``brozzle_page`` raises ``Exception`` on every call. After each
    ``brozzle_site`` call the page's ``failed_attempts`` is expected to grow
    by one. The first two failures must leave ``brozzle_count`` at 0 and the
    site ``ACTIVE``; after the third, ``brozzle_count`` must be 1 and the
    site ``FINISHED``.
    """
    page = mock.Mock(failed_attempts=None, brozzle_count=0)

    site = mock.Mock(
        status='ACTIVE',
        active_brozzling_time=0,
        starts_and_stops=[{'start':datetime.datetime.utcnow()}])

    # Stand-in for the rethinkdb handle: every query that gets run yields
    # an empty result list.
    rethink_query = mock.Mock(run=mock.Mock(return_value=[]))
    limited = mock.Mock(limit=mock.Mock(return_value=rethink_query))
    table = mock.Mock(between=mock.Mock(return_value=limited))

    rr = mock.Mock()
    rr.servers = [mock.Mock()]
    rr.db_list = mock.Mock(return_value=rethink_query)
    rr.table_list = mock.Mock(return_value=rethink_query)
    rr.table = mock.Mock(return_value=table)
    # Sanity-check the mock chain itself before relying on it.
    assert rr.table().between().limit().run() == []

    frontier = brozzler.RethinkDbFrontier(rr)
    frontier.enforce_time_limit = mock.Mock()
    frontier.honor_stop_request = mock.Mock()
    # The same page is claimed on every attempt.
    frontier.claim_page = mock.Mock(return_value=page)
    frontier._maybe_finish_job = mock.Mock()

    browser = mock.Mock()

    worker = brozzler.BrozzlerWorker(frontier)
    # Every attempt to brozzle the page blows up.
    worker.brozzle_page = mock.Mock(side_effect=Exception)

    # Starting state, before any attempt.
    assert page.failed_attempts is None
    assert page.brozzle_count == 0
    assert site.status == 'ACTIVE'

    # (failed_attempts, brozzle_count, site.status) expected after each
    # successive brozzle_site call.
    expected_progress = (
        (1, 0, 'ACTIVE'),
        (2, 0, 'ACTIVE'),
        (3, 1, 'FINISHED'),
    )
    for failures, brozzles, status in expected_progress:
        worker.brozzle_site(browser, site)
        assert page.failed_attempts == failures
        assert page.brozzle_count == brozzles
        assert site.status == status
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user