From cae71fffa1605750c5618736fad85f3d59d5e4b7 Mon Sep 17 00:00:00 2001 From: Vangelis Banos Date: Mon, 11 May 2020 22:16:00 +0000 Subject: [PATCH] Define new login behavior type Define a new `login-behaviors.yaml` that lists the JS behaviors we run when the target page is behind a paywall and we need to go to another page to log in. The problem with the current system is that the login form needs to be on the page at the target URL. If there isn't a login form there, Brozzler cannot log in. We try to fix this issue. As a first example, we use www.thetimes.co.uk, which takes this approach: it is behind a paywall and you need to go to account.thetimes.co.uk to log in. To implement this, we reuse `brozzler.behavior_script` and `brozzler.behaviors`. We run the new behavior just before `try_login`. Our aim is to go to the login page of the target site before running `try_login`. --- brozzler/__init__.py | 4 ++-- brozzler/browser.py | 13 ++++++++++++- brozzler/js-templates/login-page.js.j2 | 24 ++++++++++++++++++++++++ brozzler/login-behaviors.yaml | 16 ++++++++++++++++ 4 files changed, 54 insertions(+), 3 deletions(-) create mode 100644 brozzler/js-templates/login-page.js.j2 create mode 100644 brozzler/login-behaviors.yaml diff --git a/brozzler/__init__.py b/brozzler/__init__.py index 8a96c8b..753d16d 100644 --- a/brozzler/__init__.py +++ b/brozzler/__init__.py @@ -111,12 +111,12 @@ def behaviors(behaviors_dir=None, conf='behaviors.yaml'): _behaviors = yaml.safe_load(fin) return _behaviors -def behavior_script(url, template_parameters=None, behaviors_dir=None): +def behavior_script(url, template_parameters=None, behaviors_dir=None, behaviors=None): ''' Returns the javascript behavior string populated with template_parameters.
''' import re, logging, json - for behavior in behaviors(behaviors_dir=behaviors_dir): + for behavior in behaviors: if re.match(behavior['url_regex'], url): parameters = dict() if 'default_parameters' in behavior: diff --git a/brozzler/browser.py b/brozzler/browser.py index 6c210ff..5488a3d 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -491,6 +491,15 @@ class Browser: user_agent=user_agent) self.navigate_to_page(page_url, timeout=page_timeout) if password: + login_behaviors = brozzler.behaviors( + behaviors_dir=behaviors_dir, conf='login-behaviors.yaml' + ) + login_behavior_script = brozzler.behavior_script( + page_url, behavior_parameters, + behaviors_dir=behaviors_dir, + behaviors=login_behaviors) + self.run_behavior(login_behavior_script, + timeout=behavior_timeout) self.try_login(username, password, timeout=page_timeout) # if login redirected us, return to page_url if page_url != self.url().split('#')[0]: @@ -509,9 +518,11 @@ class Browser: run_behaviors = False if run_behaviors: + behaviors = brozzler.behaviors(behaviors_dir=behaviors_dir) behavior_script = brozzler.behavior_script( page_url, behavior_parameters, - behaviors_dir=behaviors_dir) + behaviors_dir=behaviors_dir, + behaviors=behaviors) self.run_behavior(behavior_script, timeout=behavior_timeout) final_page_url = self.url() if on_screenshot: diff --git a/brozzler/js-templates/login-page.js.j2 b/brozzler/js-templates/login-page.js.j2 new file mode 100644 index 0000000..ee2a27a --- /dev/null +++ b/brozzler/js-templates/login-page.js.j2 @@ -0,0 +1,24 @@ +/* + * brozzler/js-templates/login-page.js.j2 - an umbra/brozzler behavior class + * + * Copyright (C) 2017-2019 Internet Archive + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +var __brzl_login_page_action = function() { + var actions = {{ actions|json }}; + document.querySelector(actions[0].selector).click(); +}; + +__brzl_login_page_action(); diff --git a/brozzler/login-behaviors.yaml b/brozzler/login-behaviors.yaml new file mode 100644 index 0000000..ddeade3 --- /dev/null +++ b/brozzler/login-behaviors.yaml @@ -0,0 +1,16 @@ +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# first matched behavior is used, so order matters here +- + url_regex: '^https?://(?:www\.)?thetimes\.co\.uk/.*$' + behavior_js_template: login-page.js.j2 + request_idle_timeout_sec: 10 + default_parameters: + actions: + - selector: a[href~='https://login.thetimes.co.uk']