diff --git a/brozzler/__init__.py b/brozzler/__init__.py index 1c410ab..ae58cf5 100644 --- a/brozzler/__init__.py +++ b/brozzler/__init__.py @@ -68,29 +68,34 @@ logging._levelToName[TRACE] = 'TRACE' logging._nameToLevel['TRACE'] = TRACE _behaviors = None -def behaviors(): +def behaviors(behaviors_dir=None): + """Return list of JS behaviors loaded from YAML file. + + :param behaviors_dir: Directory containing `behaviors.yaml` and + `js-templates/`. Defaults to brozzler dir. + """ import os, yaml, string global _behaviors if _behaviors is None: - behaviors_yaml = os.path.join( - os.path.dirname(__file__), 'behaviors.yaml') + d = behaviors_dir or os.path.dirname(__file__) + behaviors_yaml = os.path.join(d, 'behaviors.yaml') with open(behaviors_yaml) as fin: _behaviors = yaml.load(fin) return _behaviors -def behavior_script(url, template_parameters=None): +def behavior_script(url, template_parameters=None, behaviors_dir=None): ''' Returns the javascript behavior string populated with template_parameters. ''' import re, logging - for behavior in behaviors(): + for behavior in behaviors(behaviors_dir=behaviors_dir): if re.match(behavior['url_regex'], url): parameters = dict() if 'default_parameters' in behavior: parameters.update(behavior['default_parameters']) if template_parameters: parameters.update(template_parameters) - template = jinja2_environment().get_template( + template = jinja2_environment(behaviors_dir).get_template( behavior['behavior_js_template']) script = template.render(parameters) logging.info( @@ -229,12 +234,16 @@ def sleep(duration): time.sleep(min(duration - elapsed, 0.5)) _jinja2_env = None -def jinja2_environment(): +def jinja2_environment(behaviors_dir=None): global _jinja2_env if not _jinja2_env: - import jinja2, json - _jinja2_env = jinja2.Environment( - loader=jinja2.PackageLoader('brozzler', 'js-templates')) + import os, jinja2, json + if behaviors_dir: + _loader = jinja2.FileSystemLoader(os.path.join(behaviors_dir, + 'js-templates')) + else: + _loader=jinja2.PackageLoader('brozzler', 'js-templates') + _jinja2_env = jinja2.Environment(loader=_loader) _jinja2_env.filters['json'] = json.dumps return _jinja2_env diff --git a/brozzler/behaviors.yaml b/brozzler/behaviors.yaml index a0ad600..77f35a6 100644 --- a/brozzler/behaviors.yaml +++ b/brozzler/behaviors.yaml @@ -1,7 +1,7 @@ # # brozzler/behaviors.yaml - behavior configuration # -# Copyright (C) 2014-2017 Internet Archive +# Copyright (C) 2014-2018 Internet Archive # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -57,6 +57,10 @@ click_css_selector: img.link-overlay click_until_hard_timeout: False request_idle_timeout_sec: 10 +- + url_regex: '^https?://(?:www\.)?huffingtonpost\.com/.*$' + behavior_js_template: huffpostslides.js + request_idle_timeout_sec: 10 - # https://webarchive.jira.com/browse/ARI-5389 url_regex: '^https?://pitchfork\.com/.*$' behavior_js_template: umbraBehavior.js.j2 @@ -64,13 +68,6 @@ actions: - selector: div.teaser, li.pager__item a closeSelector: .pmf-artist-modal__close-btn -- - url_regex: '^https?://(?:www\.)?huffingtonpost\.com/.*$' - behavior_js_template: umbraBehavior.js.j2 - default_parameters: - actions: - - selector: .slideshow-card__overlay - - selector: .slideshow__next - url_regex: '^https?://(?:www\.)?brooklynmuseum\.org/exhibitions/.*$' behavior_js_template: simpleclicks.js.j2 @@ -157,13 +154,6 @@ actions: - selector: .menu-item a do: mouseover -- url_regex: '^https?://(?:www\.)?news\.com\.au/.*$' - behavior_js_template: mouseovers.js.j2 - default_parameters: - sdo_css_selector: .menu-item a - sdo_action: mouseover - sdo_until_hard_timeout: False - request_idle_timeout_sec: 10 - # https://webarchive.jira.com/browse/ARI-5259 url_regex: '^https?://blog\.sina\.com\.cn/.*$' behavior_js_template: simpleclicks.js.j2 @@ -213,13 +203,6 @@ click_css_selector: img.link-overlay click_until_hard_timeout: False request_idle_timeout_sec: 10 -- # https://webarchive.jira.com/browse/ARI-5389 - url_regex: '^https?://pitchfork\.com/.*$' - behavior_js_template: pitchfork.js -- # https://webarchive.jira.com/browse/ARI-5379 - url_regex: '^https?://(?:www\.)?pm\.gc\.ca/.*$' - behavior_js_template: pm-ca.js - request_idle_timeout_sec: 10 - # https://webarchive.jira.com/browse/ARI-4960 url_regex: '^https?://(?:www\.)?fortstjames.ca/community-events-calendar/$' behavior_js_template: simpleclicks.js.j2 @@ -239,4 +222,4 @@ behavior_js_template: umbraBehavior.js.j2 default_parameters: actions: - - selector: button.sc-button-play, .playButton, div.soundItem, .jwlist>a, .ytp-button + - selector: button.sc-button-play, .playButton, div.soundItem, .jwlist>a diff --git a/brozzler/browser.py b/brozzler/browser.py index f9093cb..7ab51cd 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -382,7 +382,7 @@ class Browser: def browse_page( self, page_url, extra_headers=None, - user_agent=None, behavior_parameters=None, + user_agent=None, behavior_parameters=None, behaviors_dir=None, on_request=None, on_response=None, on_screenshot=None, username=None, password=None, hashtags=None, skip_extract_outlinks=False, skip_visit_hashtags=False, @@ -402,6 +402,8 @@ class Browser: supplied (default None) behavior_parameters: dict of parameters for populating the javascript behavior template (default None) + behaviors_dir: Directory containing behaviors.yaml and JS templates + (default None loads Brozzler default JS behaviors) on_request: callback to invoke on every Network.requestWillBeSent event, takes one argument, the json-decoded message (default None) @@ -452,7 +454,8 @@ class Browser: jpeg_bytes = self.screenshot() on_screenshot(jpeg_bytes) behavior_script = brozzler.behavior_script( - page_url, behavior_parameters) + page_url, behavior_parameters, + behaviors_dir=behaviors_dir) self.run_behavior(behavior_script, timeout=behavior_timeout) if skip_extract_outlinks: outlinks = [] diff --git a/brozzler/chrome.py b/brozzler/chrome.py index 23708d7..3b44773 100644 --- a/brozzler/chrome.py +++ b/brozzler/chrome.py @@ -126,6 +126,7 @@ class Chrome: '--remote-debugging-port=%s' % self.port, '--use-mock-keychain', # mac thing '--user-data-dir=%s' % self._chrome_user_data_dir, + '--disable-background-networking', '--disable-web-sockets', '--disable-cache', '--window-size=1100,900', '--no-default-browser-check', '--disable-first-run-ui', '--no-first-run', @@ -277,13 +278,13 @@ class Chrome: 'chrome pid %s reaped (status=%s) after killing with ' 'SIGKILL', self.chrome_process.pid, status) + finally: try: self._home_tmpdir.cleanup() except: self.logger.error( 'exception deleting %s', self._home_tmpdir, exc_info=True) - finally: self._out_reader_thread.join() self.chrome_process = None diff --git a/brozzler/js-templates/default.js b/brozzler/js-templates/default.js deleted file mode 100644 index a469828..0000000 --- a/brozzler/js-templates/default.js +++ /dev/null @@ -1,177 +0,0 @@ -/* - * brozzler/behaviors.d/default.js - default behavior, scrolls to the bottom of - * the page and clicks on selected embedded elements - * - * Copyright (C) 2014-2016 Internet Archive - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -var umbraAboveBelowOrOnScreen = function(e) { - var eTop = e.getBoundingClientRect().top; - if (eTop < window.scrollY) { - return -1; // above - } else if (eTop > window.scrollY + window.innerHeight) { - return 1; // below - } else { - return 0; // on screen - } -} - -var UMBRA_IFRAME_EMBEDDED_SELECTOR = "iframe"; -//elements selected for SoundCloud.com -var UMBRA_THINGS_TO_CLICK_EMBEDDED_SELECTOR = "button.sc-button-play, .playButton, div.soundItem"; -//elements selected for Archive.org Playlists -UMBRA_THINGS_TO_CLICK_EMBEDDED_SELECTOR += ", .jwlist>a" -var MAX_IFRAME_RECURSE_DEPTH = 1; //0-based -var umbraState = {'idleSince':null}; -var umbraAlreadyClicked = {}; -var umbraFinished = false; -var umbraIntervalFunc = function() { - - var umbraEmbeddedElements = []; - - getUmbraEmbeddedElements(umbraEmbeddedElements); - - var clickedSomething = false; - var somethingLeftBelow = false; - var somethingLeftAbove = false; - var missedAbove = 0; - - for (var i = 0; i < umbraEmbeddedElements.length; i++) { - - var targetId = umbraEmbeddedElements[i].id; - var target = umbraEmbeddedElements[i].target; - - if (!(targetId in umbraAlreadyClicked)) { - - var where = umbraAboveBelowOrOnScreen(target); - - if (where == 0) { // on screen - // var pos = target.getBoundingClientRect().top; - // window.scrollTo(0, target.getBoundingClientRect().top - 100); - console.log("clicking at " + target.getBoundingClientRect().top + " on " + target.outerHTML); - if (target.click != undefined) { - target.click(); - } - umbraAlreadyClicked[targetId] = true; - clickedSomething = true; - umbraState.idleSince = null; - break; - } else if (where > 0) { - somethingLeftBelow = true; - } else if (where < 0) { - somethingLeftAbove = true; - } - } - } - - if (!clickedSomething) { - if (somethingLeftAbove) { - console.log("scrolling UP because everything on this screen has been clicked but we missed something above"); - window.scrollBy(0, -500); - umbraState.idleSince = null; - } else if (somethingLeftBelow) { - console.log("scrolling because everything on this screen has been clicked but there's more below document.body.clientHeight=" + document.body.clientHeight); - window.scrollBy(0, 200); - umbraState.idleSince = null; - } else if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) { - console.log("scrolling because we're not to the bottom yet document.body.clientHeight=" + document.body.clientHeight); - window.scrollBy(0, 200); - umbraState.idleSince = null; - } else if (umbraState.idleSince == null) { - umbraState.idleSince = Date.now(); - } - } - - if (umbraState.idleSince == null) { - umbraState.idleSince = Date.now(); - } -} - -//try to detect sound cloud "Play" buttons and return them as targets for clicking -var getUmbraEmbeddedElements = function(embeddedElements, currentIframeDepth, currentDocument, - iframeElement) { - - //set default values for parameters - currentIframeDepth = currentIframeDepth || 0; - currentDocument = currentDocument || document; - - if (currentIframeDepth > MAX_IFRAME_RECURSE_DEPTH) { - return; - } - - //collect all buttons on current document first - var button = []; - - button = currentDocument.querySelectorAll(UMBRA_THINGS_TO_CLICK_EMBEDDED_SELECTOR); - - var cssPathIframe = iframeElement ? getElementCssPath(iframeElement) : ""; - - for (var i = 0; i < button.length; i++) { - embeddedElements.push({"id" : cssPathIframe + getElementCssPath(button.item(i)), "target" : button.item(i)}); - } - - //now get all buttons in embedded iframes - var iframe = []; - - iframe = currentDocument.querySelectorAll(UMBRA_IFRAME_EMBEDDED_SELECTOR); - - for (var i = 0; i < iframe.length; i++) { - getUmbraEmbeddedElements(embeddedElements, currentIframeDepth + 1, iframe[i].contentWindow.document.body, iframe[i]); - } -} - -// If we haven't had anything to do (scrolled, clicked, etc) in this amount of -// time, then we consider ourselves finished with the page. -var UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC = 10; - -// Called from outside of this script. -var umbraBehaviorFinished = function() { - if (umbraState.idleSince != null) { - var idleTimeMs = Date.now() - umbraState.idleSince; - if (idleTimeMs / 1000 > UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC) { - clearInterval(umbraIntervalId) - return true; - } - } - return false; -} - -//copied from http://stackoverflow.com/questions/4588119/get-elements-css-selector-without-element-id -var getElementCssPath = function(element) { - - var names = []; - - while (element.parentNode){ - if (element.id){ - names.unshift('#' + element.id); - break; - } else { - if (element == element.ownerDocument.documentElement) { - names.unshift(element.tagName); - } - else { - for (var c = 1, e = element; e.previousElementSibling; e = e.previousElementSibling, c++); - - names.unshift(element.tagName + ":nth-child(" + c + ")"); - } - - element = element.parentNode; - } - } - - return names.join(" > "); -} - -var umbraIntervalId = setInterval(umbraIntervalFunc, 100); diff --git a/brozzler/js-templates/pm-ca.js b/brozzler/js-templates/pm-ca.js deleted file mode 100644 index f0aa653..0000000 --- a/brozzler/js-templates/pm-ca.js +++ /dev/null @@ -1,141 +0,0 @@ -/* - * brozzler/behaviors.d/pm-ca.js - behavior for http://www.pm.gc.ca/ - * - * Copyright (C) 2014-2017 Internet Archive - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -var umbraBehavior = { - IDLE_TIMEOUT_SEC : 10, - idleSince : null, - itemsText : "", - - // https://github.com/jquery/jquery/blob/master/src/css/hiddenVisibleSelectors.js - // n.b. returns true for elements with visibility:hidden, which occupy - // screen real estate but are not visible, or clickable with the ui - isVisible : function(elem) { - return !!(elem.offsetWidth || elem.offsetHeight || elem.getClientRects().length); - }, - - intervalFunc : function() { - var clickedSomething = false; - var somethingLeftBelow = false; - var somethingLeftAbove = false; - var cssSelector = "div.teaser"; - var clickUntilTimeout = 10; - - var iframes = document.querySelectorAll("iframe"); - var documents = Array(iframes.length + 1); - documents[0] = document; - - for (var i = 0; i < iframes.length; i++) { - documents[i+1] = iframes[i].contentWindow.document; - } - - for (var j = 0; j < documents.length; j++) { - var clickTargets = documents[j].querySelectorAll(cssSelector); - for (var i = 0; i < clickTargets.length; i++) { - if (!this.isVisible(clickTargets[i])) { - continue; - } - if (this.itemsText.indexOf(clickTargets[i].innerText) > -1) { - continue; - } - - var where = this.aboveBelowOrOnScreen(clickTargets[i]); - - if (where == 0) { - // console.log("clicking on " + clickTargets[i].outerHTML); - // do mouse over event on click target - // since some urls are requsted only on - // this event - see - // https://webarchive.jira.com/browse/AITFIVE-451 - var mouseOverEvent = document.createEvent('Events'); - mouseOverEvent.initEvent("mouseover",true, false); - clickTargets[i].dispatchEvent(mouseOverEvent); - clickTargets[i].click(); - clickedSomething = true; - this.idleSince = null; - this.itemsText += clickTargets[i].innerText; - - break; //break from clickTargets loop, but not from iframe loop - } else if (where > 0) { - somethingLeftBelow = true; - } else if (where < 0) { - somethingLeftAbove = true; - } - } - } - - if (!clickedSomething) { - if (somethingLeftAbove) { - // console.log("scrolling UP because everything on this screen has been clicked but we missed something above"); - window.scrollBy(0, -500); - this.idleSince = null; - } else if (somethingLeftBelow) { - // console.log("scrolling because everything on this screen has been clicked but there's more below document.body.clientHeight=" - // + document.body.clientHeight); - window.scrollBy(0, 200); - this.idleSince = null; - } else if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) { - // console.log("scrolling because we're not to the bottom yet document.body.clientHeight=" - // + document.body.clientHeight); - window.scrollBy(0, 200); - this.idleSince = null; - } else if (this.idleSince == null) { - this.idleSince = Date.now(); - } - } - - if (!this.idleSince) { - this.idleSince = Date.now(); - } - }, - - start : function() { - var that = this; - this.intervalId = setInterval(function() { - that.intervalFunc() - }, 500); - }, - - isFinished : function() { - if (this.idleSince != null) { - var idleTimeMs = Date.now() - this.idleSince; - if (idleTimeMs / 1000 > this.IDLE_TIMEOUT_SEC) { - clearInterval(this.intervalId); - return true; - } - } - return false; - }, - - aboveBelowOrOnScreen : function(e) { - var eTop = e.getBoundingClientRect().top; - if (eTop < window.scrollY) { - return -1; // above - } else if (eTop > window.scrollY + window.innerHeight) { - return 1; // below - } else { - return 0; // on screen - } - }, -}; - -// Called from outside of this script. -var umbraBehaviorFinished = function() { - return umbraBehavior.isFinished() -}; - -umbraBehavior.start(); diff --git a/brozzler/js-templates/umbraBehavior.js.j2 b/brozzler/js-templates/umbraBehavior.js.j2 index 6ff51ef..6a0ae50 100644 --- a/brozzler/js-templates/umbraBehavior.js.j2 +++ b/brozzler/js-templates/umbraBehavior.js.j2 @@ -1,7 +1,11 @@ /* * brozzler/js-templates/umbrabehavior.js.j2 - an umbra/brozzler behavior class * +<<<<<<< HEAD * Copyright (C) 2017 Internet Archive +======= + * Copyright (C) 2017-2018 Internet Archive +>>>>>>> brofurb * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.