convert behaviors to jinja2, move them to new subdir js-templates, along with javascript previously stored as a string in browser.py

This commit is contained in:
Noah Levitt 2016-12-20 16:33:25 -08:00
parent 06fd0a0d79
commit a0b61408b9
15 changed files with 169 additions and 192 deletions

View File

@ -86,23 +86,7 @@ def behaviors():
behaviors_yaml = os.path.join(
os.path.dirname(__file__), 'behaviors.yaml')
with open(behaviors_yaml) as fin:
conf = yaml.load(fin)
_behaviors = conf['behaviors']
for behavior in _behaviors:
if 'behavior_js' in behavior:
behavior_js = os.path.join(
os.path.dirname(__file__), 'behaviors.d',
behavior['behavior_js'])
with open(behavior_js, encoding='utf-8') as fin:
behavior['script'] = fin.read()
elif 'behavior_js_template' in behavior:
behavior_js_template = os.path.join(
os.path.dirname(__file__), 'behaviors.d',
behavior['behavior_js_template'])
with open(behavior_js_template, encoding='utf-8') as fin:
behavior['template'] = string.Template(fin.read())
_behaviors = yaml.load(fin)
return _behaviors
def behavior_script(url, template_parameters=None):
@ -112,22 +96,18 @@ def behavior_script(url, template_parameters=None):
import re, logging
for behavior in behaviors():
if re.match(behavior['url_regex'], url):
if 'behavior_js' in behavior:
logging.info(
'using behavior %s for %s',
behavior['behavior_js'], url)
return behavior['script']
elif 'behavior_js_template' in behavior:
parameters = dict()
if 'default_parameters' in behavior:
parameters.update(behavior['default_parameters'])
if template_parameters:
parameters.update(template_parameters)
script = behavior['template'].safe_substitute(parameters)
logging.info(
'using template=%s populated with parameters=%s for %s',
repr(behavior['behavior_js_template']), parameters, url)
return script
parameters = dict()
if 'default_parameters' in behavior:
parameters.update(behavior['default_parameters'])
if template_parameters:
parameters.update(template_parameters)
template = jinja2_environment().get_template(
behavior['behavior_js_template'])
script = template.render(parameters)
logging.info(
'using template=%s populated with parameters=%s for %s',
repr(behavior['behavior_js_template']), parameters, url)
return script
return None
def thread_raise(thread, exctype):

View File

@ -17,83 +17,82 @@
#
# first matched behavior is used, so order matters here
behaviors:
-
url_regex: '^https?://(?:www\.)?facebook\.com/.*$'
behavior_js_template: facebook.js.template
# default_parameters:
# parameter_username: jdoe@example.com
# parameter_password: abcd1234
request_idle_timeout_sec: 30
-
url_regex: '^https?://(?:www\.)?marquette\.edu/.*$'
behavior_js: marquette_edu.js
request_idle_timeout_sec: 10
-
url_regex: '^https?://(?:www\.)?vimeo\.com/.*$'
behavior_js: vimeo.js
request_idle_timeout_sec: 10
-
url_regex: '^https?://(?:www\.)?psu24.psu.edu/.*$'
behavior_js: psu24.js
request_idle_timeout_sec: 10
-
url_regex: '^https?://(?:www\.)?instagram\.com/.*$'
behavior_js: instagram.js
request_idle_timeout_sec: 10
-
url_regex: '^https?://(?:www\.)?brooklynmuseum\.org/exhibitions/.*$'
behavior_js_template: simpleclicks.js.template
default_parameters:
click_css_selector: img.img-responsive
request_idle_timeout_sec: 10
- # acalog https://webarchive.jira.com/browse/ARI-3775
url_regex: '^https?://.*[?&]catoid=[^?]*$'
behavior_js_template: simpleclicks.js.template
default_parameters:
click_css_selector: a[onclick]
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/ARI-3956
url_regex: '^https?://(?:www\.)?usask.ca/.*$'
behavior_js_template: simpleclicks.js.template
default_parameters:
click_css_selector: a[id='feature-next']
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/AITFIVE-451
url_regex: '^https?://(?:www\.)?soundcloud.com/.*$'
behavior_js_template: simpleclicks.js.template
default_parameters:
click_css_selector: button.sc-button-play, button.playButton
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/AITFIVE-463
url_regex: '^https?://(?:www\.)?christophercerrone.com/.*$'
behavior_js_template: simpleclicks.js.template
default_parameters:
click_css_selector: button.playButton.medium
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/ARI-4690
url_regex: '^https?://(?:www\.)?youtube.com/.*$'
behavior_js_template: simpleclicks.js.template
default_parameters:
click_css_selector: span.load-more-text
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/ARI-4725
url_regex: '^https?://(?:www\.)?moma.org/.*$'
behavior_js_template: simpleclicks.js.template
default_parameters:
click_css_selector: button[data-more-results-bottom-button]
click_until_hard_timeout: True
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/ARI-4692
url_regex: '^https?://(?:www\.)?fec.gov/data/.*$'
behavior_js: fec_gov.js
request_idle_timeout_sec: 10
- url_regex: '^https?://(?:www\.)?news\.com\.au/.*$'
behavior_js_template: mouseovers.js.template
default_parameters:
mouseover_css_selector: .menu-item a
request_idle_timeout_sec: 10
- # default fallback behavior
url_regex: '^.*$'
request_idle_timeout_sec: 10
behavior_js: default.js
-
url_regex: '^https?://(?:www\.)?facebook\.com/.*$'
behavior_js_template: facebook.js.j2
# default_parameters:
# parameter_username: jdoe@example.com
# parameter_password: abcd1234
request_idle_timeout_sec: 30
-
url_regex: '^https?://(?:www\.)?marquette\.edu/.*$'
behavior_js_template: marquette_edu.js
request_idle_timeout_sec: 10
-
url_regex: '^https?://(?:www\.)?vimeo\.com/.*$'
behavior_js_template: vimeo.js
request_idle_timeout_sec: 10
-
url_regex: '^https?://(?:www\.)?psu24.psu.edu/.*$'
behavior_js_template: psu24.js
request_idle_timeout_sec: 10
-
url_regex: '^https?://(?:www\.)?instagram\.com/.*$'
behavior_js_template: instagram.js
request_idle_timeout_sec: 10
-
url_regex: '^https?://(?:www\.)?brooklynmuseum\.org/exhibitions/.*$'
behavior_js_template: simpleclicks.js.j2
default_parameters:
click_css_selector: img.img-responsive
request_idle_timeout_sec: 10
- # acalog https://webarchive.jira.com/browse/ARI-3775
url_regex: '^https?://.*[?&]catoid=[^?]*$'
behavior_js_template: simpleclicks.js.j2
default_parameters:
click_css_selector: a[onclick]
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/ARI-3956
url_regex: '^https?://(?:www\.)?usask.ca/.*$'
behavior_js_template: simpleclicks.js.j2
default_parameters:
click_css_selector: a[id='feature-next']
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/AITFIVE-451
url_regex: '^https?://(?:www\.)?soundcloud.com/.*$'
behavior_js_template: simpleclicks.js.j2
default_parameters:
click_css_selector: button.sc-button-play, button.playButton
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/AITFIVE-463
url_regex: '^https?://(?:www\.)?christophercerrone.com/.*$'
behavior_js_template: simpleclicks.js.j2
default_parameters:
click_css_selector: button.playButton.medium
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/ARI-4690
url_regex: '^https?://(?:www\.)?youtube.com/.*$'
behavior_js_template: simpleclicks.js.j2
default_parameters:
click_css_selector: span.load-more-text
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/ARI-4725
url_regex: '^https?://(?:www\.)?moma.org/.*$'
behavior_js_template: simpleclicks.js.j2
default_parameters:
click_css_selector: button[data-more-results-bottom-button]
click_until_hard_timeout: True
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/ARI-4692
url_regex: '^https?://(?:www\.)?fec.gov/data/.*$'
behavior_js_template: fec_gov.js
request_idle_timeout_sec: 10
- url_regex: '^https?://(?:www\.)?news\.com\.au/.*$'
behavior_js_template: mouseovers.js.j2
default_parameters:
mouseover_css_selector: .menu-item a
request_idle_timeout_sec: 10
- # default fallback behavior
url_regex: '^.*$'
request_idle_timeout_sec: 10
behavior_js_template: default.js

View File

@ -420,30 +420,13 @@ class Browser:
lambda: self.websock_thread.got_page_load_event,
timeout=timeout)
OUTLINKS_JS = r'''
var __brzl_framesDone = new Set();
var __brzl_compileOutlinks = function(frame) {
__brzl_framesDone.add(frame);
if (frame && frame.document) {
var outlinks = Array.prototype.slice.call(
frame.document.querySelectorAll('a[href]'));
for (var i = 0; i < frame.frames.length; i++) {
if (frame.frames[i] && !__brzl_framesDone.has(frame.frames[i])) {
outlinks = outlinks.concat(
__brzl_compileOutlinks(frame.frames[i]));
}
}
}
return outlinks;
}
__brzl_compileOutlinks(window).join('\n');
'''
def extract_outlinks(self, timeout=60):
self.logger.info('extracting outlinks')
self.websock_thread.expect_result(self._command_id.peek())
js = brozzler.jinja2_environment().get_template(
'extract-outlinks.js').render()
msg_id = self.send_to_chrome(
method='Runtime.evaluate',
params={'expression': self.OUTLINKS_JS})
method='Runtime.evaluate', params={'expression': js})
self._wait_for(
lambda: self.websock_thread.received_result(msg_id),
timeout=timeout)
@ -524,64 +507,9 @@ __brzl_compileOutlinks(window).join('\n');
except BrowsingTimeout:
pass
TRY_LOGIN_JS_J2 = '''
var __brzl_tryLoginState = 'trying';
var __brzl_tryLogin = function() {
for (var i = 0; i < document.forms.length; i++) {
var form = document.forms[i];
if (form.method != 'post') {
continue;
}
var usernameField, passwordField;
for (var j = 0; j < form.elements.length; j++) {
var field = form.elements[j];
if (field.type == 'text' || field.type == 'email') {
if (!usernameField) {
usernameField = field;
} else {
usernameField = undefined;
break;
}
} else if (field.type == 'password') {
if (!passwordField) {
passwordField = field;
} else {
passwordField = undefined;
break;
}
} else if (field.type == 'textarea') {
usernameField = undefined;
passwordField = undefined;
break;
}
}
if (usernameField && passwordField) {
usernameField.value = {{username|json}};
passwordField.value = {{password|json}};
console.log('submitting username=' + usernameField.value
+ ' password=*** to detected login form');
try {
form.submit();
} catch (e) {
// "If a form control (such as a submit button) has a name or
// id of 'submit' it will mask the form's submit method." -MDN
// http://stackoverflow.com/a/2000021
var pseudoForm = document.createElement('form');
pseudoForm.submit.apply(form);
}
__brzl_tryLoginState = 'submitted-form';
return;
}
}
__brzl_tryLoginState = 'login-form-not-found';
};
__brzl_tryLogin();
'''
def try_login(self, username, password, timeout=300):
try_login_js = brozzler.jinja2_environment().from_string(
self.TRY_LOGIN_JS_J2).render(
try_login_js = brozzler.jinja2_environment().get_template(
'try-login.js.j2').render(
username=username, password=password)
self.websock_thread.got_page_load_event = None

View File

@ -0,0 +1,16 @@
// Collects the outlinks of the current page, including those found in any
// (nested) frames, and evaluates to them as a single newline-separated string.
//
// This script is meant to be passed as the expression of a `Runtime.evaluate`
// call; the value of the last statement is the result of the evaluation.

// Frames that have already been visited, so that no frame is scanned twice.
var __brzl_framesDone = new Set();

// Returns an array of the `a[href]` elements found in `frame` and,
// recursively, in its not-yet-visited child frames. Always returns an array
// (possibly empty), even when `frame` is falsy or has no accessible document.
var __brzl_compileOutlinks = function(frame) {
    __brzl_framesDone.add(frame);
    // Start from an empty list so callers can always concat()/join() the
    // result; leaving this undefined would inject an `undefined` entry into
    // the parent's list (or break join() for the top-level window).
    var outlinks = [];
    if (frame && frame.document) {
        // NOTE(review): reading `frame.document` presumably requires the frame
        // to be accessible from this page (same origin, or web security
        // disabled in the browser) -- confirm for the deployment.
        outlinks = Array.prototype.slice.call(
                frame.document.querySelectorAll('a[href]'));
        for (var i = 0; i < frame.frames.length; i++) {
            if (frame.frames[i] && !__brzl_framesDone.has(frame.frames[i])) {
                outlinks = outlinks.concat(
                        __brzl_compileOutlinks(frame.frames[i]));
            }
        }
    }
    return outlinks;
}

// join() stringifies each anchor element, which yields its href.
__brzl_compileOutlinks(window).join('\n');

View File

@ -0,0 +1,53 @@
var __brzl_tryLoginState = 'trying';
var __brzl_tryLogin = function() {
for (var i = 0; i < document.forms.length; i++) {
var form = document.forms[i];
if (form.method != 'post') {
continue;
}
var usernameField, passwordField;
for (var j = 0; j < form.elements.length; j++) {
var field = form.elements[j];
if (field.type == 'text' || field.type == 'email') {
if (!usernameField) {
usernameField = field;
} else {
usernameField = undefined;
break;
}
} else if (field.type == 'password') {
if (!passwordField) {
passwordField = field;
} else {
passwordField = undefined;
break;
}
} else if (field.type == 'textarea') {
usernameField = undefined;
passwordField = undefined;
break;
}
}
if (usernameField && passwordField) {
usernameField.value = {{username|json}};
passwordField.value = {{password|json}};
console.log('submitting username=' + usernameField.value
+ ' password=*** to detected login form');
try {
form.submit();
} catch (e) {
// "If a form control (such as a submit button) has a name or
// id of 'submit' it will mask the form's submit method." -MDN
// http://stackoverflow.com/a/2000021
var pseudoForm = document.createElement('form');
pseudoForm.submit.apply(form);
}
__brzl_tryLoginState = 'submitted-form';
return;
}
}
__brzl_tryLoginState = 'login-form-not-found';
};
__brzl_tryLogin();

View File

@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup(
name='brozzler',
version='1.1b9.dev148',
version='1.1b9.dev149',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',
@ -41,7 +41,8 @@ setuptools.setup(
license='Apache License 2.0',
packages=['brozzler', 'brozzler.dashboard'],
package_data={
'brozzler': ['behaviors.d/*.js*', 'behaviors.yaml', 'job_schema.yaml'],
'brozzler': [
'js-templates/*.js*', 'behaviors.yaml', 'job_schema.yaml'],
'brozzler.dashboard': find_package_data('brozzler.dashboard'),
},
entry_points={