mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-20 07:45:50 -04:00
convert behaviors to jinja2, move them to new subdir js-templates, along with javascript previously stored as a string in browser.py
This commit is contained in:
parent
06fd0a0d79
commit
a0b61408b9
@ -86,23 +86,7 @@ def behaviors():
|
||||
behaviors_yaml = os.path.join(
|
||||
os.path.dirname(__file__), 'behaviors.yaml')
|
||||
with open(behaviors_yaml) as fin:
|
||||
conf = yaml.load(fin)
|
||||
_behaviors = conf['behaviors']
|
||||
|
||||
for behavior in _behaviors:
|
||||
if 'behavior_js' in behavior:
|
||||
behavior_js = os.path.join(
|
||||
os.path.dirname(__file__), 'behaviors.d',
|
||||
behavior['behavior_js'])
|
||||
with open(behavior_js, encoding='utf-8') as fin:
|
||||
behavior['script'] = fin.read()
|
||||
elif 'behavior_js_template' in behavior:
|
||||
behavior_js_template = os.path.join(
|
||||
os.path.dirname(__file__), 'behaviors.d',
|
||||
behavior['behavior_js_template'])
|
||||
with open(behavior_js_template, encoding='utf-8') as fin:
|
||||
behavior['template'] = string.Template(fin.read())
|
||||
|
||||
_behaviors = yaml.load(fin)
|
||||
return _behaviors
|
||||
|
||||
def behavior_script(url, template_parameters=None):
|
||||
@ -112,22 +96,18 @@ def behavior_script(url, template_parameters=None):
|
||||
import re, logging
|
||||
for behavior in behaviors():
|
||||
if re.match(behavior['url_regex'], url):
|
||||
if 'behavior_js' in behavior:
|
||||
logging.info(
|
||||
'using behavior %s for %s',
|
||||
behavior['behavior_js'], url)
|
||||
return behavior['script']
|
||||
elif 'behavior_js_template' in behavior:
|
||||
parameters = dict()
|
||||
if 'default_parameters' in behavior:
|
||||
parameters.update(behavior['default_parameters'])
|
||||
if template_parameters:
|
||||
parameters.update(template_parameters)
|
||||
script = behavior['template'].safe_substitute(parameters)
|
||||
logging.info(
|
||||
'using template=%s populated with parameters=%s for %s',
|
||||
repr(behavior['behavior_js_template']), parameters, url)
|
||||
return script
|
||||
parameters = dict()
|
||||
if 'default_parameters' in behavior:
|
||||
parameters.update(behavior['default_parameters'])
|
||||
if template_parameters:
|
||||
parameters.update(template_parameters)
|
||||
template = jinja2_environment().get_template(
|
||||
behavior['behavior_js_template'])
|
||||
script = template.render(parameters)
|
||||
logging.info(
|
||||
'using template=%s populated with parameters=%s for %s',
|
||||
repr(behavior['behavior_js_template']), parameters, url)
|
||||
return script
|
||||
return None
|
||||
|
||||
def thread_raise(thread, exctype):
|
||||
|
@ -17,83 +17,82 @@
|
||||
#
|
||||
|
||||
# first matched behavior is used, so order matters here
|
||||
behaviors:
|
||||
-
|
||||
url_regex: '^https?://(?:www\.)?facebook\.com/.*$'
|
||||
behavior_js_template: facebook.js.template
|
||||
# default_parameters:
|
||||
# parameter_username: jdoe@example.com
|
||||
# parameter_password: abcd1234
|
||||
request_idle_timeout_sec: 30
|
||||
-
|
||||
url_regex: '^https?://(?:www\.)?marquette\.edu/.*$'
|
||||
behavior_js: marquette_edu.js
|
||||
request_idle_timeout_sec: 10
|
||||
-
|
||||
url_regex: '^https?://(?:www\.)?vimeo\.com/.*$'
|
||||
behavior_js: vimeo.js
|
||||
request_idle_timeout_sec: 10
|
||||
-
|
||||
url_regex: '^https?://(?:www\.)?psu24.psu.edu/.*$'
|
||||
behavior_js: psu24.js
|
||||
request_idle_timeout_sec: 10
|
||||
-
|
||||
url_regex: '^https?://(?:www\.)?instagram\.com/.*$'
|
||||
behavior_js: instagram.js
|
||||
request_idle_timeout_sec: 10
|
||||
-
|
||||
url_regex: '^https?://(?:www\.)?brooklynmuseum\.org/exhibitions/.*$'
|
||||
behavior_js_template: simpleclicks.js.template
|
||||
default_parameters:
|
||||
click_css_selector: img.img-responsive
|
||||
request_idle_timeout_sec: 10
|
||||
- # acalog https://webarchive.jira.com/browse/ARI-3775
|
||||
url_regex: '^https?://.*[?&]catoid=[^?]*$'
|
||||
behavior_js_template: simpleclicks.js.template
|
||||
default_parameters:
|
||||
click_css_selector: a[onclick]
|
||||
request_idle_timeout_sec: 10
|
||||
- # https://webarchive.jira.com/browse/ARI-3956
|
||||
url_regex: '^https?://(?:www\.)?usask.ca/.*$'
|
||||
behavior_js_template: simpleclicks.js.template
|
||||
default_parameters:
|
||||
click_css_selector: a[id='feature-next']
|
||||
request_idle_timeout_sec: 10
|
||||
- # https://webarchive.jira.com/browse/AITFIVE-451
|
||||
url_regex: '^https?://(?:www\.)?soundcloud.com/.*$'
|
||||
behavior_js_template: simpleclicks.js.template
|
||||
default_parameters:
|
||||
click_css_selector: button.sc-button-play, button.playButton
|
||||
request_idle_timeout_sec: 10
|
||||
- # https://webarchive.jira.com/browse/AITFIVE-463
|
||||
url_regex: '^https?://(?:www\.)?christophercerrone.com/.*$'
|
||||
behavior_js_template: simpleclicks.js.template
|
||||
default_parameters:
|
||||
click_css_selector: button.playButton.medium
|
||||
request_idle_timeout_sec: 10
|
||||
- # https://webarchive.jira.com/browse/ARI-4690
|
||||
url_regex: '^https?://(?:www\.)?youtube.com/.*$'
|
||||
behavior_js_template: simpleclicks.js.template
|
||||
default_parameters:
|
||||
click_css_selector: span.load-more-text
|
||||
request_idle_timeout_sec: 10
|
||||
- # https://webarchive.jira.com/browse/ARI-4725
|
||||
url_regex: '^https?://(?:www\.)?moma.org/.*$'
|
||||
behavior_js_template: simpleclicks.js.template
|
||||
default_parameters:
|
||||
click_css_selector: button[data-more-results-bottom-button]
|
||||
click_until_hard_timeout: True
|
||||
request_idle_timeout_sec: 10
|
||||
- # https://webarchive.jira.com/browse/ARI-4692
|
||||
url_regex: '^https?://(?:www\.)?fec.gov/data/.*$'
|
||||
behavior_js: fec_gov.js
|
||||
request_idle_timeout_sec: 10
|
||||
- url_regex: '^https?://(?:www\.)?news\.com\.au/.*$'
|
||||
behavior_js_template: mouseovers.js.template
|
||||
default_parameters:
|
||||
mouseover_css_selector: .menu-item a
|
||||
request_idle_timeout_sec: 10
|
||||
- # default fallback behavior
|
||||
url_regex: '^.*$'
|
||||
request_idle_timeout_sec: 10
|
||||
behavior_js: default.js
|
||||
-
|
||||
url_regex: '^https?://(?:www\.)?facebook\.com/.*$'
|
||||
behavior_js_template: facebook.js.j2
|
||||
# default_parameters:
|
||||
# parameter_username: jdoe@example.com
|
||||
# parameter_password: abcd1234
|
||||
request_idle_timeout_sec: 30
|
||||
-
|
||||
url_regex: '^https?://(?:www\.)?marquette\.edu/.*$'
|
||||
behavior_js_template: marquette_edu.js
|
||||
request_idle_timeout_sec: 10
|
||||
-
|
||||
url_regex: '^https?://(?:www\.)?vimeo\.com/.*$'
|
||||
behavior_js_template: vimeo.js
|
||||
request_idle_timeout_sec: 10
|
||||
-
|
||||
url_regex: '^https?://(?:www\.)?psu24.psu.edu/.*$'
|
||||
behavior_js_template: psu24.js
|
||||
request_idle_timeout_sec: 10
|
||||
-
|
||||
url_regex: '^https?://(?:www\.)?instagram\.com/.*$'
|
||||
behavior_js_template: instagram.js
|
||||
request_idle_timeout_sec: 10
|
||||
-
|
||||
url_regex: '^https?://(?:www\.)?brooklynmuseum\.org/exhibitions/.*$'
|
||||
behavior_js_template: simpleclicks.js.j2
|
||||
default_parameters:
|
||||
click_css_selector: img.img-responsive
|
||||
request_idle_timeout_sec: 10
|
||||
- # acalog https://webarchive.jira.com/browse/ARI-3775
|
||||
url_regex: '^https?://.*[?&]catoid=[^?]*$'
|
||||
behavior_js_template: simpleclicks.js.j2
|
||||
default_parameters:
|
||||
click_css_selector: a[onclick]
|
||||
request_idle_timeout_sec: 10
|
||||
- # https://webarchive.jira.com/browse/ARI-3956
|
||||
url_regex: '^https?://(?:www\.)?usask.ca/.*$'
|
||||
behavior_js_template: simpleclicks.js.j2
|
||||
default_parameters:
|
||||
click_css_selector: a[id='feature-next']
|
||||
request_idle_timeout_sec: 10
|
||||
- # https://webarchive.jira.com/browse/AITFIVE-451
|
||||
url_regex: '^https?://(?:www\.)?soundcloud.com/.*$'
|
||||
behavior_js_template: simpleclicks.js.j2
|
||||
default_parameters:
|
||||
click_css_selector: button.sc-button-play, button.playButton
|
||||
request_idle_timeout_sec: 10
|
||||
- # https://webarchive.jira.com/browse/AITFIVE-463
|
||||
url_regex: '^https?://(?:www\.)?christophercerrone.com/.*$'
|
||||
behavior_js_template: simpleclicks.js.j2
|
||||
default_parameters:
|
||||
click_css_selector: button.playButton.medium
|
||||
request_idle_timeout_sec: 10
|
||||
- # https://webarchive.jira.com/browse/ARI-4690
|
||||
url_regex: '^https?://(?:www\.)?youtube.com/.*$'
|
||||
behavior_js_template: simpleclicks.js.j2
|
||||
default_parameters:
|
||||
click_css_selector: span.load-more-text
|
||||
request_idle_timeout_sec: 10
|
||||
- # https://webarchive.jira.com/browse/ARI-4725
|
||||
url_regex: '^https?://(?:www\.)?moma.org/.*$'
|
||||
behavior_js_template: simpleclicks.js.j2
|
||||
default_parameters:
|
||||
click_css_selector: button[data-more-results-bottom-button]
|
||||
click_until_hard_timeout: True
|
||||
request_idle_timeout_sec: 10
|
||||
- # https://webarchive.jira.com/browse/ARI-4692
|
||||
url_regex: '^https?://(?:www\.)?fec.gov/data/.*$'
|
||||
behavior_js_template: fec_gov.js
|
||||
request_idle_timeout_sec: 10
|
||||
- url_regex: '^https?://(?:www\.)?news\.com\.au/.*$'
|
||||
behavior_js_template: mouseovers.js.j2
|
||||
default_parameters:
|
||||
mouseover_css_selector: .menu-item a
|
||||
request_idle_timeout_sec: 10
|
||||
- # default fallback behavior
|
||||
url_regex: '^.*$'
|
||||
request_idle_timeout_sec: 10
|
||||
behavior_js_template: default.js
|
||||
|
@ -420,30 +420,13 @@ class Browser:
|
||||
lambda: self.websock_thread.got_page_load_event,
|
||||
timeout=timeout)
|
||||
|
||||
OUTLINKS_JS = r'''
|
||||
var __brzl_framesDone = new Set();
|
||||
var __brzl_compileOutlinks = function(frame) {
|
||||
__brzl_framesDone.add(frame);
|
||||
if (frame && frame.document) {
|
||||
var outlinks = Array.prototype.slice.call(
|
||||
frame.document.querySelectorAll('a[href]'));
|
||||
for (var i = 0; i < frame.frames.length; i++) {
|
||||
if (frame.frames[i] && !__brzl_framesDone.has(frame.frames[i])) {
|
||||
outlinks = outlinks.concat(
|
||||
__brzl_compileOutlinks(frame.frames[i]));
|
||||
}
|
||||
}
|
||||
}
|
||||
return outlinks;
|
||||
}
|
||||
__brzl_compileOutlinks(window).join('\n');
|
||||
'''
|
||||
def extract_outlinks(self, timeout=60):
|
||||
self.logger.info('extracting outlinks')
|
||||
self.websock_thread.expect_result(self._command_id.peek())
|
||||
js = brozzler.jinja2_environment().get_template(
|
||||
'extract-outlinks.js').render()
|
||||
msg_id = self.send_to_chrome(
|
||||
method='Runtime.evaluate',
|
||||
params={'expression': self.OUTLINKS_JS})
|
||||
method='Runtime.evaluate', params={'expression': js})
|
||||
self._wait_for(
|
||||
lambda: self.websock_thread.received_result(msg_id),
|
||||
timeout=timeout)
|
||||
@ -524,64 +507,9 @@ __brzl_compileOutlinks(window).join('\n');
|
||||
except BrowsingTimeout:
|
||||
pass
|
||||
|
||||
TRY_LOGIN_JS_J2 = '''
|
||||
var __brzl_tryLoginState = 'trying';
|
||||
|
||||
var __brzl_tryLogin = function() {
|
||||
for (var i = 0; i < document.forms.length; i++) {
|
||||
var form = document.forms[i];
|
||||
if (form.method != 'post') {
|
||||
continue;
|
||||
}
|
||||
var usernameField, passwordField;
|
||||
for (var j = 0; j < form.elements.length; j++) {
|
||||
var field = form.elements[j];
|
||||
if (field.type == 'text' || field.type == 'email') {
|
||||
if (!usernameField) {
|
||||
usernameField = field;
|
||||
} else {
|
||||
usernameField = undefined;
|
||||
break;
|
||||
}
|
||||
} else if (field.type == 'password') {
|
||||
if (!passwordField) {
|
||||
passwordField = field;
|
||||
} else {
|
||||
passwordField = undefined;
|
||||
break;
|
||||
}
|
||||
} else if (field.type == 'textarea') {
|
||||
usernameField = undefined;
|
||||
passwordField = undefined;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (usernameField && passwordField) {
|
||||
usernameField.value = {{username|json}};
|
||||
passwordField.value = {{password|json}};
|
||||
console.log('submitting username=' + usernameField.value
|
||||
+ ' password=*** to detected login form');
|
||||
try {
|
||||
form.submit();
|
||||
} catch (e) {
|
||||
// "If a form control (such as a submit button) has a name or
|
||||
// id of 'submit' it will mask the form's submit method." -MDN
|
||||
// http://stackoverflow.com/a/2000021
|
||||
var pseudoForm = document.createElement('form');
|
||||
pseudoForm.submit.apply(form);
|
||||
}
|
||||
__brzl_tryLoginState = 'submitted-form';
|
||||
return;
|
||||
}
|
||||
}
|
||||
__brzl_tryLoginState = 'login-form-not-found';
|
||||
};
|
||||
|
||||
__brzl_tryLogin();
|
||||
'''
|
||||
def try_login(self, username, password, timeout=300):
|
||||
try_login_js = brozzler.jinja2_environment().from_string(
|
||||
self.TRY_LOGIN_JS_J2).render(
|
||||
try_login_js = brozzler.jinja2_environment().get_template(
|
||||
'try-login.js.j2').render(
|
||||
username=username, password=password)
|
||||
|
||||
self.websock_thread.got_page_load_event = None
|
||||
|
16
brozzler/js-templates/extract-outlinks.js
Normal file
16
brozzler/js-templates/extract-outlinks.js
Normal file
@ -0,0 +1,16 @@
|
||||
// Collects the outlinks of the current page, including links found in nested
// frames. The value of the last expression (one link per line) is the result
// of evaluating this script.

// Frames already visited, so that no frame is walked more than once.
var __brzl_framesDone = new Set();

// Returns an array of the a[href] elements found in `frame` and, recursively,
// in its child frames. Always returns an array: a frame with no accessible
// document contributes nothing.
var __brzl_compileOutlinks = function(frame) {
    __brzl_framesDone.add(frame);
    // Start from an empty array so that the caller's concat() and the final
    // join() never see `undefined` when the frame has no document.
    var outlinks = [];
    if (frame && frame.document) {
        outlinks = Array.prototype.slice.call(
                frame.document.querySelectorAll('a[href]'));
        for (var i = 0; i < frame.frames.length; i++) {
            if (frame.frames[i] && !__brzl_framesDone.has(frame.frames[i])) {
                outlinks = outlinks.concat(
                        __brzl_compileOutlinks(frame.frames[i]));
            }
        }
    }
    return outlinks;
};

// join() stringifies each anchor element, which yields its href.
__brzl_compileOutlinks(window).join('\n');
|
53
brozzler/js-templates/try-login.js.j2
Normal file
53
brozzler/js-templates/try-login.js.j2
Normal file
@ -0,0 +1,53 @@
|
||||
var __brzl_tryLoginState = 'trying';
|
||||
|
||||
var __brzl_tryLogin = function() {
|
||||
for (var i = 0; i < document.forms.length; i++) {
|
||||
var form = document.forms[i];
|
||||
if (form.method != 'post') {
|
||||
continue;
|
||||
}
|
||||
var usernameField, passwordField;
|
||||
for (var j = 0; j < form.elements.length; j++) {
|
||||
var field = form.elements[j];
|
||||
if (field.type == 'text' || field.type == 'email') {
|
||||
if (!usernameField) {
|
||||
usernameField = field;
|
||||
} else {
|
||||
usernameField = undefined;
|
||||
break;
|
||||
}
|
||||
} else if (field.type == 'password') {
|
||||
if (!passwordField) {
|
||||
passwordField = field;
|
||||
} else {
|
||||
passwordField = undefined;
|
||||
break;
|
||||
}
|
||||
} else if (field.type == 'textarea') {
|
||||
usernameField = undefined;
|
||||
passwordField = undefined;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (usernameField && passwordField) {
|
||||
usernameField.value = {{username|json}};
|
||||
passwordField.value = {{password|json}};
|
||||
console.log('submitting username=' + usernameField.value
|
||||
+ ' password=*** to detected login form');
|
||||
try {
|
||||
form.submit();
|
||||
} catch (e) {
|
||||
// "If a form control (such as a submit button) has a name or
|
||||
// id of 'submit' it will mask the form's submit method." -MDN
|
||||
// http://stackoverflow.com/a/2000021
|
||||
var pseudoForm = document.createElement('form');
|
||||
pseudoForm.submit.apply(form);
|
||||
}
|
||||
__brzl_tryLoginState = 'submitted-form';
|
||||
return;
|
||||
}
|
||||
}
|
||||
__brzl_tryLoginState = 'login-form-not-found';
|
||||
};
|
||||
|
||||
__brzl_tryLogin();
|
5
setup.py
5
setup.py
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||
|
||||
setuptools.setup(
|
||||
name='brozzler',
|
||||
version='1.1b9.dev148',
|
||||
version='1.1b9.dev149',
|
||||
description='Distributed web crawling with browsers',
|
||||
url='https://github.com/internetarchive/brozzler',
|
||||
author='Noah Levitt',
|
||||
@ -41,7 +41,8 @@ setuptools.setup(
|
||||
license='Apache License 2.0',
|
||||
packages=['brozzler', 'brozzler.dashboard'],
|
||||
package_data={
|
||||
'brozzler': ['behaviors.d/*.js*', 'behaviors.yaml', 'job_schema.yaml'],
|
||||
'brozzler': [
|
||||
'js-templates/*.js*', 'behaviors.yaml', 'job_schema.yaml'],
|
||||
'brozzler.dashboard': find_package_data('brozzler.dashboard'),
|
||||
},
|
||||
entry_points={
|
||||
|
Loading…
x
Reference in New Issue
Block a user