/*
 * brozzler/behaviors.d/simpledo.js.template - simpledo behavior template,
 * acting on elements matching templatized css selector,
 * based on simpleclicks.js.template and mouseovers.js.template
 *
 * Copyright (C) 2016 Internet Archive
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * Behavior object driven by a 250 ms timer (see start). On each tick it looks
 * for elements matching the templated css selector in the top document and in
 * every reachable iframe document, performs the templated action on at most
 * one on-screen element per document, and otherwise scrolls towards elements
 * that are still pending. isFinished() reports true once nothing has happened
 * for more than IDLE_TIMEOUT_SEC seconds.
 *
 * Template parameters (substituted before this script is run):
 *   sdo_css_selector       - css selector of the elements to act on
 *   sdo_action             - "click" or "mouseover"
 *   sdo_until_hard_timeout - "True" to keep re-doing elements that were
 *                            already handled; any other value means false
 */
var umbraBehavior = {
    IDLE_TIMEOUT_SEC : 10,
    idleSince : null,

    /*
     * One polling tick: act on at most one on-screen target per document,
     * or scroll the window when nothing could be done.
     */
    intervalFunc : function() {
        var cssSelector = "${sdo_css_selector}";
        var doAction = "${sdo_action}"; // currently supports click, mouseover
        // Python renders booleans as "True"/"False", so only the literal
        // string "True" turns this on.
        var doUntilTimeout = "${sdo_until_hard_timeout}" === "True";

        var didSomething = false;
        var somethingLeftBelow = false;
        var somethingLeftAbove = false;

        var documents = this.collectDocuments();

        for (var d = 0; d < documents.length; d++) {
            var doTargets = documents[d].querySelectorAll(cssSelector);

            for (var t = 0; t < doTargets.length; t++) {
                var target = doTargets[t];

                if (target.umbraDone && !doUntilTimeout) {
                    continue;
                }

                var where = this.aboveBelowOrOnScreen(target);

                if (where === 0) {
                    console.log("doing " + doAction + " " + target.outerHTML);
                    // do mouse over event on target
                    // since some urls are requested only on
                    // this event - see
                    // https://webarchive.jira.com/browse/AITFIVE-451
                    var mouseOverEvent = document.createEvent('Events');
                    mouseOverEvent.initEvent('mouseover', true, false);
                    target.dispatchEvent(mouseOverEvent);

                    if (doAction === 'click') {
                        target.click();
                    } // add new do's here!

                    didSomething = true;
                    this.idleSince = null;
                    target.umbraDone = true;

                    break; // break from doTargets loop (not from iframe loop)
                } else if (where > 0) {
                    somethingLeftBelow = true;
                } else if (where < 0) {
                    somethingLeftAbove = true;
                }
            }
        }

        if (!didSomething) {
            var notAtBottomYet = window.scrollY + window.innerHeight
                    < document.documentElement.scrollHeight;

            if (somethingLeftAbove) {
                // everything on this screen has been done but we missed
                // something above
                window.scrollBy(0, -500);
                this.idleSince = null;
            } else if (somethingLeftBelow || notAtBottomYet) {
                // either there are more targets below, or we are simply not
                // at the bottom of the page yet
                window.scrollBy(0, 200);
                this.idleSince = null;
            }
        }

        // (Re)start the idle clock whenever it is not running. After a tick
        // that did or scrolled something this records "now", so the timeout
        // in isFinished is measured from the most recent activity.
        if (!this.idleSince) {
            this.idleSince = Date.now();
        }
    },

    /*
     * Returns the top-level document followed by the document of every
     * iframe that can be reached from this script. Reading the document of a
     * cross-origin frame throws, so such frames are skipped instead of
     * aborting the whole tick.
     */
    collectDocuments : function() {
        var documents = [document];
        var iframes = document.querySelectorAll("iframe");

        for (var i = 0; i < iframes.length; i++) {
            try {
                var frameWindow = iframes[i].contentWindow;
                if (frameWindow && frameWindow.document) {
                    documents.push(frameWindow.document);
                }
            } catch (e) {
                // deliberately ignored: the frame is not accessible (for
                // example blocked by the same-origin policy)
            }
        }

        return documents;
    },

    /* Starts the polling timer; its id is kept in this.intervalId. */
    start : function() {
        var that = this;
        this.intervalId = setInterval(function() {
            that.intervalFunc();
        }, 250);
    },

    /*
     * Returns true, and stops the polling timer, once the behavior has been
     * idle for more than IDLE_TIMEOUT_SEC seconds; false otherwise.
     */
    isFinished : function() {
        if (this.idleSince != null) {
            var idleTimeMs = Date.now() - this.idleSince;
            if (idleTimeMs / 1000 > this.IDLE_TIMEOUT_SEC) {
                clearInterval(this.intervalId);
                return true;
            }
        }
        return false;
    },

    /*
     * Classifies element e by the top edge of its bounding rectangle:
     * -1 above the visible area, 1 below it, 0 on screen.
     *
     * getBoundingClientRect() is relative to the viewport, so the top edge
     * is compared with 0 and window.innerHeight, not with window.scrollY.
     * NOTE(review): for elements inside an iframe the rectangle is presumably
     * relative to that iframe's own viewport - confirm if that matters.
     */
    aboveBelowOrOnScreen : function(e) {
        var eTop = e.getBoundingClientRect().top;
        if (eTop < 0) {
            return -1; // above
        } else if (eTop > window.innerHeight) {
            return 1; // below
        } else {
            return 0; // on screen
        }
    },
};

// Called from outside of this script.
+var umbraBehaviorFinished = function() { /* returns the result of umbraBehavior.isFinished() */ + return umbraBehavior.isFinished() +}; + +umbraBehavior.start(); /* begins the setInterval polling set up in umbraBehavior.start */ diff --git a/brozzler/behaviors.yaml b/brozzler/behaviors.yaml index 1130bcc..3f57bfa 100644 --- a/brozzler/behaviors.yaml +++ b/brozzler/behaviors.yaml @@ -1,7 +1,7 @@ # # brozzler/behaviors.yaml - behavior configuration # -# Copyright (C) 2014-2016 Internet Archive +# Copyright (C) 2014-2017 Internet Archive # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,86 +17,85 @@ # # first matched behavior is used, so order matters here -- - url_regex: '^https?://(?:www\.)?facebook\.com/.*$' - behavior_js_template: facebook.js - request_idle_timeout_sec: 30 -- - url_regex: '^https?://(?:www\.)?marquette\.edu/.*$' - behavior_js_template: marquette_edu.js - request_idle_timeout_sec: 10 -- - url_regex: '^https?://(?:www\.)?vimeo\.com/.*$' - behavior_js_template: vimeo.js - request_idle_timeout_sec: 10 -- - url_regex: '^https?://(?:www\.)?psu24.psu.edu/.*$' - behavior_js_template: psu24.js - request_idle_timeout_sec: 10 -- - url_regex: '^https?://(?:www\.)?instagram\.com/.*$' - behavior_js_template: instagram.js - request_idle_timeout_sec: 10 -- - url_regex: '^https?://(?:www\.)?brooklynmuseum\.org/exhibitions/.*$' - behavior_js_template: simpleclicks.js.j2 - default_parameters: - click_css_selector: img.img-responsive - click_until_hard_timeout: False - request_idle_timeout_sec: 10 -- # acalog https://webarchive.jira.com/browse/ARI-3775 - url_regex: '^https?://.*[?&]catoid=[^?]*$' - behavior_js_template: simpleclicks.js.j2 - default_parameters: - click_css_selector: a[onclick] - click_until_hard_timeout: False - request_idle_timeout_sec: 10 -- # https://webarchive.jira.com/browse/ARI-3956 - url_regex: '^https?://(?:www\.)?usask.ca/.*$' - behavior_js_template: simpleclicks.js.j2 - default_parameters: - click_css_selector: a[id='feature-next'] - click_until_hard_timeout: False - 
request_idle_timeout_sec: 10 -- # https://webarchive.jira.com/browse/AITFIVE-451 - url_regex: '^https?://(?:www\.)?soundcloud.com/.*$' - behavior_js_template: simpleclicks.js.j2 - default_parameters: - click_css_selector: button.sc-button-play, button.playButton - click_until_hard_timeout: False - request_idle_timeout_sec: 10 -- # https://webarchive.jira.com/browse/AITFIVE-463 - url_regex: '^https?://(?:www\.)?christophercerrone.com/.*$' - behavior_js_template: simpleclicks.js.j2 - default_parameters: - click_css_selector: button.playButton.medium - click_until_hard_timeout: False - request_idle_timeout_sec: 10 -- # https://webarchive.jira.com/browse/ARI-4690 - url_regex: '^https?://(?:www\.)?youtube.com/.*$' - behavior_js_template: simpleclicks.js.j2 - default_parameters: - click_css_selector: span.load-more-text - click_until_hard_timeout: False - request_idle_timeout_sec: 10 -- # https://webarchive.jira.com/browse/ARI-4725 - url_regex: '^https?://(?:www\.)?moma.org/.*$' - behavior_js_template: simpleclicks.js.j2 - default_parameters: - click_css_selector: button[data-more-results-bottom-button] - click_until_hard_timeout: True - request_idle_timeout_sec: 10 -- # https://webarchive.jira.com/browse/ARI-4692 - url_regex: '^https?://(?:www\.)?fec.gov/data/.*$' - behavior_js_template: fec_gov.js - request_idle_timeout_sec: 10 -- url_regex: '^https?://(?:www\.)?news\.com\.au/.*$' - behavior_js_template: mouseovers.js.j2 - default_parameters: - mouseover_css_selector: .menu-item a - mouseover_until_hard_timeout: False - request_idle_timeout_sec: 10 -- # default fallback behavior - url_regex: '^.*$' - request_idle_timeout_sec: 10 - behavior_js_template: default.js +behaviors: + - + url_regex: '^https?://(?:www\.)?facebook\.com/.*$' + behavior_js_template: facebook.js.template + # default_parameters: + # parameter_username: jdoe@example.com + # parameter_password: abcd1234 + request_idle_timeout_sec: 30 + - + url_regex: '^https?://(?:www\.)?marquette\.edu/.*$' + 
behavior_js: marquette_edu.js + request_idle_timeout_sec: 10 + - + url_regex: '^https?://(?:www\.)?vimeo\.com/.*$' + behavior_js: vimeo.js + request_idle_timeout_sec: 10 + - + url_regex: '^https?://(?:www\.)?psu24.psu.edu/.*$' + behavior_js: psu24.js + request_idle_timeout_sec: 10 + - + url_regex: '^https?://(?:www\.)?instagram\.com/.*$' + behavior_js: instagram.js + request_idle_timeout_sec: 10 + - + url_regex: '^https?://(?:www\.)?brooklynmuseum\.org/exhibitions/.*$' + behavior_js_template: simpleclicks.js.template + default_parameters: + click_css_selector: img.img-responsive + request_idle_timeout_sec: 10 + - # acalog https://webarchive.jira.com/browse/ARI-3775 + url_regex: '^https?://.*[?&]catoid=[^?]*$' + behavior_js_template: simpleclicks.js.template + default_parameters: + click_css_selector: a[onclick] + request_idle_timeout_sec: 10 + - # https://webarchive.jira.com/browse/ARI-3956 + url_regex: '^https?://(?:www\.)?usask.ca/.*$' + behavior_js_template: simpleclicks.js.template + default_parameters: + click_css_selector: a[id='feature-next'] + request_idle_timeout_sec: 10 + - # https://webarchive.jira.com/browse/AITFIVE-451 + url_regex: '^https?://(?:www\.)?soundcloud.com/.*$' + behavior_js_template: simpledo.js.template + default_parameters: + sdo_css_selector: button.sc-button-play, button.playButton + sdo_action: click # sdo_until_hard_timeout is not set here; the template enables it only for the literal "True" + request_idle_timeout_sec: 10 + - # https://webarchive.jira.com/browse/AITFIVE-463 + url_regex: '^https?://(?:www\.)?christophercerrone.com/.*$' + behavior_js_template: simpleclicks.js.template + default_parameters: + click_css_selector: button.playButton.medium + request_idle_timeout_sec: 10 + - # https://webarchive.jira.com/browse/ARI-4690 + url_regex: '^https?://(?:www\.)?youtube.com/.*$' + behavior_js_template: simpleclicks.js.template + default_parameters: + click_css_selector: span.load-more-text + request_idle_timeout_sec: 10 + - # https://webarchive.jira.com/browse/ARI-4725 + url_regex: '^https?://(?:www\.)?moma.org/.*$' + 
behavior_js_template: simpleclicks.js.template + default_parameters: + click_css_selector: button[data-more-results-bottom-button] + click_until_hard_timeout: True + request_idle_timeout_sec: 10 + - # https://webarchive.jira.com/browse/ARI-4692 + url_regex: '^https?://(?:www\.)?fec.gov/data/.*$' + behavior_js: fec_gov.js + request_idle_timeout_sec: 10 + - url_regex: '^https?://(?:www\.)?news\.com\.au/.*$' + behavior_js_template: simpledo.js.template + default_parameters: + sdo_css_selector: .menu-item a + sdo_action: mouseover # sdo_until_hard_timeout is not set here; the template enables it only for the literal "True" + request_idle_timeout_sec: 10 + - # default fallback behavior + url_regex: '^.*$' + request_idle_timeout_sec: 10 + behavior_js: default.js