diff --git a/brozzler/__init__.py b/brozzler/__init__.py index ae58cf5..ed9bdea 100644 --- a/brozzler/__init__.py +++ b/brozzler/__init__.py @@ -88,6 +88,7 @@ def behavior_script(url, template_parameters=None, behaviors_dir=None): Returns the javascript behavior string populated with template_parameters. ''' import re, logging + timeout_from_behavior = None for behavior in behaviors(behaviors_dir=behaviors_dir): if re.match(behavior['url_regex'], url): parameters = dict() @@ -101,8 +102,10 @@ def behavior_script(url, template_parameters=None, behaviors_dir=None): logging.info( 'using template=%r populated with parameters=%r for %r', behavior['behavior_js_template'], parameters, url) - return script - return None + if 'behavior_timeout_sec' in parameters: + timeout_from_behavior = int(parameters['behavior_timeout_sec']) + return script, timeout_from_behavior + return None, timeout_from_behavior class ThreadExceptionGate: logger = logging.getLogger(__module__ + "." + __qualname__) diff --git a/brozzler/behaviors.yaml b/brozzler/behaviors.yaml index 909a290..1d4c9c6 100644 --- a/brozzler/behaviors.yaml +++ b/brozzler/behaviors.yaml @@ -43,8 +43,9 @@ default_parameters: actions: - selector: div.teaser, li.pager__item a -- # https://webarchive.jira.com/browse/ARI-5430 - url_regex: '^https?://www\.careers\.ox\.ac\.uk/.*$' + behavior_timeout_sec: 1800 +- # https://webarchive.jira.com/browse/ARI-5389 + url_regex: '^https?://pitchfork\.com/.*$' behavior_js_template: umbraBehavior.js.j2 default_parameters: actions: diff --git a/brozzler/browser.py b/brozzler/browser.py index 7ab51cd..f35f72b 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -453,9 +453,11 @@ class Browser: if on_screenshot: jpeg_bytes = self.screenshot() on_screenshot(jpeg_bytes) - behavior_script = brozzler.behavior_script( + behavior_script, timeout_from_behavior = brozzler.behavior_script( page_url, behavior_parameters, behaviors_dir=behaviors_dir) + if timeout_from_behavior > behavior_timeout: + behavior_timeout = timeout_from_behavior self.run_behavior(behavior_script, timeout=behavior_timeout) if skip_extract_outlinks: outlinks = []