diff --git a/brozzler/__init__.py b/brozzler/__init__.py
index 8a96c8b..753d16d 100644
--- a/brozzler/__init__.py
+++ b/brozzler/__init__.py
@@ -111,12 +111,20 @@ def behaviors(behaviors_dir=None, conf='behaviors.yaml'):
             _behaviors = yaml.safe_load(fin)
     return _behaviors
 
-def behavior_script(url, template_parameters=None, behaviors_dir=None):
+def behavior_script(url, template_parameters=None, behaviors_dir=None, behaviors=None):
     '''
     Returns the javascript behavior string populated with template_parameters.
+
+    `behaviors` is the list of behavior definitions whose `url_regex` is
+    matched against `url`; when None, the default behaviors are loaded
+    with behaviors(behaviors_dir=behaviors_dir).
     '''
     import re, logging, json
-    for behavior in behaviors(behaviors_dir=behaviors_dir):
+    if behaviors is None:
+        # the `behaviors` argument shadows the module-level behaviors()
+        # loader, so the loader has to be looked up explicitly
+        behaviors = globals()['behaviors'](behaviors_dir=behaviors_dir)
+    for behavior in behaviors:
         if re.match(behavior['url_regex'], url):
             parameters = dict()
             if 'default_parameters' in behavior:
diff --git a/brozzler/browser.py b/brozzler/browser.py
index 6c210ff..5488a3d 100644
--- a/brozzler/browser.py
+++ b/brozzler/browser.py
@@ -491,6 +491,23 @@ class Browser:
                         user_agent=user_agent)
                 self.navigate_to_page(page_url, timeout=page_timeout)
                 if password:
+                    # site-specific login behaviors (e.g. clicking through to
+                    # the login form) run before the generic login attempt
+                    # NOTE(review): confirm brozzler.behaviors() caches per
+                    # `conf`; otherwise whichever file loads first is reused
+                    login_behaviors = brozzler.behaviors(
+                            behaviors_dir=behaviors_dir,
+                            conf='login-behaviors.yaml')
+                    login_behavior_script = brozzler.behavior_script(
+                            page_url, behavior_parameters,
+                            behaviors_dir=behaviors_dir,
+                            behaviors=login_behaviors)
+                    # presumably no script comes back when no login behavior
+                    # matches page_url; skip running a behavior in that case
+                    if login_behavior_script:
+                        self.run_behavior(
+                                login_behavior_script,
+                                timeout=behavior_timeout)
                     self.try_login(username, password, timeout=page_timeout)
                     # if login redirected us, return to page_url
                     if page_url != self.url().split('#')[0]:
@@ -509,9 +526,11 @@ class Browser:
                     run_behaviors = False
 
                 if run_behaviors:
+                    behaviors = brozzler.behaviors(behaviors_dir=behaviors_dir)
                     behavior_script = brozzler.behavior_script(
                             page_url, behavior_parameters,
-                            behaviors_dir=behaviors_dir)
+                            behaviors_dir=behaviors_dir,
+                            behaviors=behaviors)
                     self.run_behavior(behavior_script, timeout=behavior_timeout)
                 final_page_url = self.url()
                 if on_screenshot:
diff --git a/brozzler/js-templates/login-page.js.j2 b/brozzler/js-templates/login-page.js.j2
new file mode 100644
index 0000000..ee2a27a
--- /dev/null
+++ b/brozzler/js-templates/login-page.js.j2
@@ -0,0 +1,35 @@
+/*
+ * brozzler/js-templates/login-page.js.j2 - an umbra/brozzler behavior class
+ *
+ * Copyright (C) 2017-2019 Internet Archive
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Clicks the element matched by the first configured action's CSS selector
+ * (e.g. a "log in" link). Does nothing when no action is configured or no
+ * element matches, instead of throwing on a null element.
+ */
+var __brzl_login_page_action = function() {
+    var actions = {{ actions|json }};
+    if (!actions || !actions.length) {
+        return;
+    }
+    var target = document.querySelector(actions[0].selector);
+    if (target) {
+        target.click();
+    }
+};
+
+__brzl_login_page_action();
diff --git a/brozzler/login-behaviors.yaml b/brozzler/login-behaviors.yaml
new file mode 100644
index 0000000..ddeade3
--- /dev/null
+++ b/brozzler/login-behaviors.yaml
@@ -0,0 +1,19 @@
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# behaviors run on a page right before brozzler attempts to log in
+# first matched behavior is used, so order matters here
+-
+  url_regex: '^https?://(?:www\.)?thetimes\.co\.uk/.*$'
+  behavior_js_template: login-page.js.j2
+  request_idle_timeout_sec: 10
+  default_parameters:
+    actions:
+      # `^=` is a prefix match; `~=` only matches a whole
+      # whitespace-separated word, i.e. an href with nothing after the host
+      - selector: a[href^='https://login.thetimes.co.uk']