mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-20 23:56:34 -04:00
Define new login behavior type
Define a new `login-behaviors.yaml` that lists the JS behaviors we run when the target page is behind a paywall and we need to go to another page to log in. The problem with the current system is that the login form needs to be on the page at the target URL. If there isn't a login form there, Brozzler cannot log in. We try to fix this issue. As a first example, we use www.thetimes.co.uk, which takes this approach: it uses a paywall, and you need to go to account.thetimes.co.uk to log in. To implement this, we reuse `brozzler.behavior_script` and `brozzler.behaviors`. We run the new behavior just before `try_login`; the aim is to reach the login page of the target site before `try_login` runs.
This commit is contained in:
parent
5307760aaf
commit
cae71fffa1
@ -111,12 +111,12 @@ def behaviors(behaviors_dir=None, conf='behaviors.yaml'):
|
||||
_behaviors = yaml.safe_load(fin)
|
||||
return _behaviors
|
||||
|
||||
def behavior_script(url, template_parameters=None, behaviors_dir=None):
|
||||
def behavior_script(url, template_parameters=None, behaviors_dir=None, behaviors=None):
|
||||
'''
|
||||
Returns the javascript behavior string populated with template_parameters.
|
||||
'''
|
||||
import re, logging, json
|
||||
for behavior in behaviors(behaviors_dir=behaviors_dir):
|
||||
for behavior in behaviors:
|
||||
if re.match(behavior['url_regex'], url):
|
||||
parameters = dict()
|
||||
if 'default_parameters' in behavior:
|
||||
|
@ -491,6 +491,15 @@ class Browser:
|
||||
user_agent=user_agent)
|
||||
self.navigate_to_page(page_url, timeout=page_timeout)
|
||||
if password:
|
||||
login_behaviors = brozzler.behaviors(
|
||||
behaviors_dir=behaviors_dir, conf='login-behaviors.yaml'
|
||||
)
|
||||
login_behavior_script = brozzler.behavior_script(
|
||||
page_url, behavior_parameters,
|
||||
behaviors_dir=behaviors_dir,
|
||||
behaviors=login_behaviors)
|
||||
self.run_behavior(login_behavior_script,
|
||||
timeout=behavior_timeout)
|
||||
self.try_login(username, password, timeout=page_timeout)
|
||||
# if login redirected us, return to page_url
|
||||
if page_url != self.url().split('#')[0]:
|
||||
@ -509,9 +518,11 @@ class Browser:
|
||||
run_behaviors = False
|
||||
|
||||
if run_behaviors:
|
||||
behaviors = brozzler.behaviors(behaviors_dir=behaviors_dir)
|
||||
behavior_script = brozzler.behavior_script(
|
||||
page_url, behavior_parameters,
|
||||
behaviors_dir=behaviors_dir)
|
||||
behaviors_dir=behaviors_dir,
|
||||
behaviors=behaviors)
|
||||
self.run_behavior(behavior_script, timeout=behavior_timeout)
|
||||
final_page_url = self.url()
|
||||
if on_screenshot:
|
||||
|
24
brozzler/js-templates/login-page.js.j2
Normal file
24
brozzler/js-templates/login-page.js.j2
Normal file
@ -0,0 +1,24 @@
|
||||
/*
|
||||
* brozzler/js-templates/login-page.js.j2 - an umbra/brozzler behavior class
|
||||
*
|
||||
* Copyright (C) 2017-2019 Internet Archive
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
var __brzl_login_page_action = function() {
|
||||
var actions = {{ actions|json }};
|
||||
document.querySelector(actions[0].selector).click();
|
||||
};
|
||||
|
||||
__brzl_login_page_action();
|
16
brozzler/login-behaviors.yaml
Normal file
16
brozzler/login-behaviors.yaml
Normal file
@ -0,0 +1,16 @@
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
# first matched behavior is used, so order matters here
|
||||
-
  url_regex: '^https?://(?:www\.)?thetimes\.co\.uk/.*$'
  behavior_js_template: login-page.js.j2
  request_idle_timeout_sec: 10
  default_parameters:
    actions:
      # `^=` is a prefix match, so the login link is found even when its
      # href carries a path or query string after the host. The previous
      # `~=` only matches a whole whitespace-separated word, i.e. an href
      # that is exactly this URL.
      - selector: "a[href^='https://login.thetimes.co.uk']"
|
Loading…
x
Reference in New Issue
Block a user