diff --git a/brozzler/__init__.py b/brozzler/__init__.py index 46a0704..6853c40 100644 --- a/brozzler/__init__.py +++ b/brozzler/__init__.py @@ -86,23 +86,7 @@ def behaviors(): behaviors_yaml = os.path.join( os.path.dirname(__file__), 'behaviors.yaml') with open(behaviors_yaml) as fin: - conf = yaml.load(fin) - _behaviors = conf['behaviors'] - - for behavior in _behaviors: - if 'behavior_js' in behavior: - behavior_js = os.path.join( - os.path.dirname(__file__), 'behaviors.d', - behavior['behavior_js']) - with open(behavior_js, encoding='utf-8') as fin: - behavior['script'] = fin.read() - elif 'behavior_js_template' in behavior: - behavior_js_template = os.path.join( - os.path.dirname(__file__), 'behaviors.d', - behavior['behavior_js_template']) - with open(behavior_js_template, encoding='utf-8') as fin: - behavior['template'] = string.Template(fin.read()) - + _behaviors = yaml.load(fin) return _behaviors def behavior_script(url, template_parameters=None): @@ -112,22 +96,18 @@ def behavior_script(url, template_parameters=None): import re, logging for behavior in behaviors(): if re.match(behavior['url_regex'], url): - if 'behavior_js' in behavior: - logging.info( - 'using behavior %s for %s', - behavior['behavior_js'], url) - return behavior['script'] - elif 'behavior_js_template' in behavior: - parameters = dict() - if 'default_parameters' in behavior: - parameters.update(behavior['default_parameters']) - if template_parameters: - parameters.update(template_parameters) - script = behavior['template'].safe_substitute(parameters) - logging.info( - 'using template=%s populated with parameters=%s for %s', - repr(behavior['behavior_js_template']), parameters, url) - return script + parameters = dict() + if 'default_parameters' in behavior: + parameters.update(behavior['default_parameters']) + if template_parameters: + parameters.update(template_parameters) + template = jinja2_environment().get_template( + behavior['behavior_js_template']) + script = template.render(parameters) + logging.info( + 'using template=%s populated with parameters=%s for %s', + repr(behavior['behavior_js_template']), parameters, url) + return script return None def thread_raise(thread, exctype): diff --git a/brozzler/behaviors.yaml b/brozzler/behaviors.yaml index c9697c0..8a7dab2 100644 --- a/brozzler/behaviors.yaml +++ b/brozzler/behaviors.yaml @@ -17,83 +17,82 @@ # # first matched behavior is used, so order matters here -behaviors: - - - url_regex: '^https?://(?:www\.)?facebook\.com/.*$' - behavior_js_template: facebook.js.template - # default_parameters: - # parameter_username: jdoe@example.com - # parameter_password: abcd1234 - request_idle_timeout_sec: 30 - - - url_regex: '^https?://(?:www\.)?marquette\.edu/.*$' - behavior_js: marquette_edu.js - request_idle_timeout_sec: 10 - - - url_regex: '^https?://(?:www\.)?vimeo\.com/.*$' - behavior_js: vimeo.js - request_idle_timeout_sec: 10 - - - url_regex: '^https?://(?:www\.)?psu24.psu.edu/.*$' - behavior_js: psu24.js - request_idle_timeout_sec: 10 - - - url_regex: '^https?://(?:www\.)?instagram\.com/.*$' - behavior_js: instagram.js - request_idle_timeout_sec: 10 - - - url_regex: '^https?://(?:www\.)?brooklynmuseum\.org/exhibitions/.*$' - behavior_js_template: simpleclicks.js.template - default_parameters: - click_css_selector: img.img-responsive - request_idle_timeout_sec: 10 - - # acalog https://webarchive.jira.com/browse/ARI-3775 - url_regex: '^https?://.*[?&]catoid=[^?]*$' - behavior_js_template: simpleclicks.js.template - default_parameters: - click_css_selector: a[onclick] - request_idle_timeout_sec: 10 - - # https://webarchive.jira.com/browse/ARI-3956 - url_regex: '^https?://(?:www\.)?usask.ca/.*$' - behavior_js_template: simpleclicks.js.template - default_parameters: - click_css_selector: a[id='feature-next'] - request_idle_timeout_sec: 10 - - # https://webarchive.jira.com/browse/AITFIVE-451 - url_regex: '^https?://(?:www\.)?soundcloud.com/.*$' - behavior_js_template: simpleclicks.js.template - default_parameters: - click_css_selector: button.sc-button-play, button.playButton - request_idle_timeout_sec: 10 - - # https://webarchive.jira.com/browse/AITFIVE-463 - url_regex: '^https?://(?:www\.)?christophercerrone.com/.*$' - behavior_js_template: simpleclicks.js.template - default_parameters: - click_css_selector: button.playButton.medium - request_idle_timeout_sec: 10 - - # https://webarchive.jira.com/browse/ARI-4690 - url_regex: '^https?://(?:www\.)?youtube.com/.*$' - behavior_js_template: simpleclicks.js.template - default_parameters: - click_css_selector: span.load-more-text - request_idle_timeout_sec: 10 - - # https://webarchive.jira.com/browse/ARI-4725 - url_regex: '^https?://(?:www\.)?moma.org/.*$' - behavior_js_template: simpleclicks.js.template - default_parameters: - click_css_selector: button[data-more-results-bottom-button] - click_until_hard_timeout: True - request_idle_timeout_sec: 10 - - # https://webarchive.jira.com/browse/ARI-4692 - url_regex: '^https?://(?:www\.)?fec.gov/data/.*$' - behavior_js: fec_gov.js - request_idle_timeout_sec: 10 - - url_regex: '^https?://(?:www\.)?news\.com\.au/.*$' - behavior_js_template: mouseovers.js.template - default_parameters: - mouseover_css_selector: .menu-item a - request_idle_timeout_sec: 10 - - # default fallback behavior - url_regex: '^.*$' - request_idle_timeout_sec: 10 - behavior_js: default.js +- + url_regex: '^https?://(?:www\.)?facebook\.com/.*$' + behavior_js_template: facebook.js.j2 + # default_parameters: + # parameter_username: jdoe@example.com + # parameter_password: abcd1234 + request_idle_timeout_sec: 30 +- + url_regex: '^https?://(?:www\.)?marquette\.edu/.*$' + behavior_js_template: marquette_edu.js + request_idle_timeout_sec: 10 +- + url_regex: '^https?://(?:www\.)?vimeo\.com/.*$' + behavior_js_template: vimeo.js + request_idle_timeout_sec: 10 +- + url_regex: '^https?://(?:www\.)?psu24.psu.edu/.*$' + behavior_js_template: psu24.js + request_idle_timeout_sec: 10 +- + url_regex: '^https?://(?:www\.)?instagram\.com/.*$' + behavior_js_template: instagram.js + request_idle_timeout_sec: 10 +- + url_regex: '^https?://(?:www\.)?brooklynmuseum\.org/exhibitions/.*$' + behavior_js_template: simpleclicks.js.j2 + default_parameters: + click_css_selector: img.img-responsive + request_idle_timeout_sec: 10 +- # acalog https://webarchive.jira.com/browse/ARI-3775 + url_regex: '^https?://.*[?&]catoid=[^?]*$' + behavior_js_template: simpleclicks.js.j2 + default_parameters: + click_css_selector: a[onclick] + request_idle_timeout_sec: 10 +- # https://webarchive.jira.com/browse/ARI-3956 + url_regex: '^https?://(?:www\.)?usask.ca/.*$' + behavior_js_template: simpleclicks.js.j2 + default_parameters: + click_css_selector: a[id='feature-next'] + request_idle_timeout_sec: 10 +- # https://webarchive.jira.com/browse/AITFIVE-451 + url_regex: '^https?://(?:www\.)?soundcloud.com/.*$' + behavior_js_template: simpleclicks.js.j2 + default_parameters: + click_css_selector: button.sc-button-play, button.playButton + request_idle_timeout_sec: 10 +- # https://webarchive.jira.com/browse/AITFIVE-463 + url_regex: '^https?://(?:www\.)?christophercerrone.com/.*$' + behavior_js_template: simpleclicks.js.j2 + default_parameters: + click_css_selector: button.playButton.medium + request_idle_timeout_sec: 10 +- # https://webarchive.jira.com/browse/ARI-4690 + url_regex: '^https?://(?:www\.)?youtube.com/.*$' + behavior_js_template: simpleclicks.js.j2 + default_parameters: + click_css_selector: span.load-more-text + request_idle_timeout_sec: 10 +- # https://webarchive.jira.com/browse/ARI-4725 + url_regex: '^https?://(?:www\.)?moma.org/.*$' + behavior_js_template: simpleclicks.js.j2 + default_parameters: + click_css_selector: button[data-more-results-bottom-button] + click_until_hard_timeout: True + request_idle_timeout_sec: 10 +- # https://webarchive.jira.com/browse/ARI-4692 + url_regex: '^https?://(?:www\.)?fec.gov/data/.*$' + behavior_js_template: fec_gov.js + request_idle_timeout_sec: 10 +- url_regex: '^https?://(?:www\.)?news\.com\.au/.*$' + behavior_js_template: mouseovers.js.j2 + default_parameters: + mouseover_css_selector: .menu-item a + request_idle_timeout_sec: 10 +- # default fallback behavior + url_regex: '^.*$' + request_idle_timeout_sec: 10 + behavior_js_template: default.js diff --git a/brozzler/browser.py b/brozzler/browser.py index 6f2058b..5595e7a 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -420,30 +420,13 @@ class Browser: lambda: self.websock_thread.got_page_load_event, timeout=timeout) - OUTLINKS_JS = r''' -var __brzl_framesDone = new Set(); -var __brzl_compileOutlinks = function(frame) { - __brzl_framesDone.add(frame); - if (frame && frame.document) { - var outlinks = Array.prototype.slice.call( - frame.document.querySelectorAll('a[href]')); - for (var i = 0; i < frame.frames.length; i++) { - if (frame.frames[i] && !__brzl_framesDone.has(frame.frames[i])) { - outlinks = outlinks.concat( - __brzl_compileOutlinks(frame.frames[i])); - } - } - } - return outlinks; -} -__brzl_compileOutlinks(window).join('\n'); -''' def extract_outlinks(self, timeout=60): self.logger.info('extracting outlinks') self.websock_thread.expect_result(self._command_id.peek()) + js = brozzler.jinja2_environment().get_template( + 'extract-outlinks.js').render() msg_id = self.send_to_chrome( - method='Runtime.evaluate', - params={'expression': self.OUTLINKS_JS}) + method='Runtime.evaluate', params={'expression': js}) self._wait_for( lambda: self.websock_thread.received_result(msg_id), timeout=timeout) @@ -524,64 +507,9 @@ __brzl_compileOutlinks(window).join('\n'); except BrowsingTimeout: pass - TRY_LOGIN_JS_J2 = ''' -var __brzl_tryLoginState = 'trying'; - -var __brzl_tryLogin = function() { - for (var i = 0; i < document.forms.length; i++) { - var form = document.forms[i]; - if (form.method != 'post') { - continue; - } - var usernameField, passwordField; - for (var j = 0; j < form.elements.length; j++) { - var field = form.elements[j]; - if (field.type == 'text' || field.type == 'email') { - if (!usernameField) { - usernameField = field; - } else { - usernameField = undefined; - break; - } - } else if (field.type == 'password') { - if (!passwordField) { - passwordField = field; - } else { - passwordField = undefined; - break; - } - } else if (field.type == 'textarea') { - usernameField = undefined; - passwordField = undefined; - break; - } - } - if (usernameField && passwordField) { - usernameField.value = {{username|json}}; - passwordField.value = {{password|json}}; - console.log('submitting username=' + usernameField.value - + ' password=*** to detected login form'); - try { - form.submit(); - } catch (e) { - // "If a form control (such as a submit button) has a name or - // id of 'submit' it will mask the form's submit method." -MDN - // http://stackoverflow.com/a/2000021 - var pseudoForm = document.createElement('form'); - pseudoForm.submit.apply(form); - } - __brzl_tryLoginState = 'submitted-form'; - return; - } - } - __brzl_tryLoginState = 'login-form-not-found'; -}; - -__brzl_tryLogin(); -''' def try_login(self, username, password, timeout=300): - try_login_js = brozzler.jinja2_environment().from_string( - self.TRY_LOGIN_JS_J2).render( + try_login_js = brozzler.jinja2_environment().get_template( + 'try-login.js.j2').render( username=username, password=password) self.websock_thread.got_page_load_event = None diff --git a/brozzler/behaviors.d/default.js b/brozzler/js-templates/default.js similarity index 100% rename from brozzler/behaviors.d/default.js rename to brozzler/js-templates/default.js diff --git a/brozzler/js-templates/extract-outlinks.js b/brozzler/js-templates/extract-outlinks.js new file mode 100644 index 0000000..3be0dfc --- /dev/null +++ b/brozzler/js-templates/extract-outlinks.js @@ -0,0 +1,16 @@ +var __brzl_framesDone = new Set(); +var __brzl_compileOutlinks = function(frame) { + __brzl_framesDone.add(frame); + if (frame && frame.document) { + var outlinks = Array.prototype.slice.call( + frame.document.querySelectorAll('a[href]')); + for (var i = 0; i < frame.frames.length; i++) { + if (frame.frames[i] && !__brzl_framesDone.has(frame.frames[i])) { + outlinks = outlinks.concat( + __brzl_compileOutlinks(frame.frames[i])); + } + } + } + return outlinks; +} +__brzl_compileOutlinks(window).join('\n'); diff --git a/brozzler/behaviors.d/facebook.js.template b/brozzler/js-templates/facebook.js.j2 similarity index 100% rename from brozzler/behaviors.d/facebook.js.template rename to brozzler/js-templates/facebook.js.j2 diff --git a/brozzler/behaviors.d/fec_gov.js b/brozzler/js-templates/fec_gov.js similarity index 100% rename from brozzler/behaviors.d/fec_gov.js rename to brozzler/js-templates/fec_gov.js diff --git a/brozzler/behaviors.d/instagram.js b/brozzler/js-templates/instagram.js similarity index 100% rename from brozzler/behaviors.d/instagram.js rename to brozzler/js-templates/instagram.js diff --git a/brozzler/behaviors.d/marquette_edu.js b/brozzler/js-templates/marquette_edu.js similarity index 100% rename from brozzler/behaviors.d/marquette_edu.js rename to brozzler/js-templates/marquette_edu.js diff --git a/brozzler/behaviors.d/mouseovers.js.template b/brozzler/js-templates/mouseovers.js.j2 similarity index 100% rename from brozzler/behaviors.d/mouseovers.js.template rename to brozzler/js-templates/mouseovers.js.j2 diff --git a/brozzler/behaviors.d/psu24.js b/brozzler/js-templates/psu24.js similarity index 100% rename from brozzler/behaviors.d/psu24.js rename to brozzler/js-templates/psu24.js diff --git a/brozzler/behaviors.d/simpleclicks.js.template b/brozzler/js-templates/simpleclicks.js.j2 similarity index 100% rename from brozzler/behaviors.d/simpleclicks.js.template rename to brozzler/js-templates/simpleclicks.js.j2 diff --git a/brozzler/js-templates/try-login.js.j2 b/brozzler/js-templates/try-login.js.j2 new file mode 100644 index 0000000..e6bbfa3 --- /dev/null +++ b/brozzler/js-templates/try-login.js.j2 @@ -0,0 +1,53 @@ +var __brzl_tryLoginState = 'trying'; + +var __brzl_tryLogin = function() { + for (var i = 0; i < document.forms.length; i++) { + var form = document.forms[i]; + if (form.method != 'post') { + continue; + } + var usernameField, passwordField; + for (var j = 0; j < form.elements.length; j++) { + var field = form.elements[j]; + if (field.type == 'text' || field.type == 'email') { + if (!usernameField) { + usernameField = field; + } else { + usernameField = undefined; + break; + } + } else if (field.type == 'password') { + if (!passwordField) { + passwordField = field; + } else { + passwordField = undefined; + break; + } + } else if (field.type == 'textarea') { + usernameField = undefined; + passwordField = undefined; + break; + } + } + if (usernameField && passwordField) { + usernameField.value = {{username|json}}; + passwordField.value = {{password|json}}; + console.log('submitting username=' + usernameField.value + + ' password=*** to detected login form'); + try { + form.submit(); + } catch (e) { + // "If a form control (such as a submit button) has a name or + // id of 'submit' it will mask the form's submit method." -MDN + // http://stackoverflow.com/a/2000021 + var pseudoForm = document.createElement('form'); + pseudoForm.submit.apply(form); + } + __brzl_tryLoginState = 'submitted-form'; + return; + } + } + __brzl_tryLoginState = 'login-form-not-found'; +}; + +__brzl_tryLogin(); diff --git a/brozzler/behaviors.d/vimeo.js b/brozzler/js-templates/vimeo.js similarity index 100% rename from brozzler/behaviors.d/vimeo.js rename to brozzler/js-templates/vimeo.js diff --git a/setup.py b/setup.py index f009fe0..2df83d2 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b9.dev148', + version='1.1b9.dev149', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', @@ -41,7 +41,8 @@ setuptools.setup( license='Apache License 2.0', packages=['brozzler', 'brozzler.dashboard'], package_data={ - 'brozzler': ['behaviors.d/*.js*', 'behaviors.yaml', 'job_schema.yaml'], + 'brozzler': [ + 'js-templates/*.js*', 'behaviors.yaml', 'job_schema.yaml'], 'brozzler.dashboard': find_package_data('brozzler.dashboard'), }, entry_points={