Merge branch 'brofurb' into qa

This commit is contained in:
Barbara Miller 2018-01-15 19:52:38 -08:00
commit 2773c4ab6f
7 changed files with 36 additions and 354 deletions

View File

@ -68,29 +68,34 @@ logging._levelToName[TRACE] = 'TRACE'
logging._nameToLevel['TRACE'] = TRACE
_behaviors = None  # lazily-built cache: behaviors directory -> parsed list

def behaviors(behaviors_dir=None):
    """Return list of JS behaviors loaded from YAML file.

    Results are cached per directory, so a call with a different
    `behaviors_dir` loads that directory's `behaviors.yaml` instead of
    returning whatever happened to be loaded first.

    :param behaviors_dir: Directory containing `behaviors.yaml` and
        `js-templates/`. Defaults to brozzler dir.
    :return: the parsed contents of `behaviors.yaml`
    """
    import os, yaml
    global _behaviors
    if _behaviors is None:
        _behaviors = {}
    d = behaviors_dir or os.path.dirname(__file__)
    if d not in _behaviors:
        behaviors_yaml = os.path.join(d, 'behaviors.yaml')
        with open(behaviors_yaml) as fin:
            # safe_load: the file may come from a caller-supplied directory,
            # and plain yaml.load() without a Loader can construct arbitrary
            # python objects (and is rejected by current PyYAML releases)
            _behaviors[d] = yaml.safe_load(fin)
    return _behaviors[d]
def behavior_script(url, template_parameters=None):
def behavior_script(url, template_parameters=None, behaviors_dir=None):
'''
Returns the javascript behavior string populated with template_parameters.
'''
import re, logging
for behavior in behaviors():
for behavior in behaviors(behaviors_dir=behaviors_dir):
if re.match(behavior['url_regex'], url):
parameters = dict()
if 'default_parameters' in behavior:
parameters.update(behavior['default_parameters'])
if template_parameters:
parameters.update(template_parameters)
template = jinja2_environment().get_template(
template = jinja2_environment(behaviors_dir).get_template(
behavior['behavior_js_template'])
script = template.render(parameters)
logging.info(
@ -229,12 +234,16 @@ def sleep(duration):
time.sleep(min(duration - elapsed, 0.5))
_jinja2_env = None  # lazily-built cache: behaviors_dir (None = packaged) -> Environment

def jinja2_environment(behaviors_dir=None):
    """Return the jinja2 environment used to render JS behavior templates.

    Environments are cached per `behaviors_dir`, so templates are always
    looked up in the directory that was asked for rather than in whichever
    directory was used on the first call.

    :param behaviors_dir: Directory containing `js-templates/`. When not
        supplied, templates are loaded from the `js-templates` directory of
        the `brozzler` package.
    :return: `jinja2.Environment` with a `json` filter (`json.dumps`)
    """
    global _jinja2_env
    if _jinja2_env is None:
        _jinja2_env = {}
    key = behaviors_dir or None
    if key not in _jinja2_env:
        import os, jinja2, json
        if behaviors_dir:
            loader = jinja2.FileSystemLoader(
                    os.path.join(behaviors_dir, 'js-templates'))
        else:
            loader = jinja2.PackageLoader('brozzler', 'js-templates')
        env = jinja2.Environment(loader=loader)
        env.filters['json'] = json.dumps
        _jinja2_env[key] = env
    return _jinja2_env[key]

View File

@ -1,7 +1,7 @@
#
# brozzler/behaviors.yaml - behavior configuration
#
# Copyright (C) 2014-2017 Internet Archive
# Copyright (C) 2014-2018 Internet Archive
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -57,6 +57,10 @@
click_css_selector: img.link-overlay
click_until_hard_timeout: False
request_idle_timeout_sec: 10
-
url_regex: '^https?://(?:www\.)?huffingtonpost\.com/.*$'
behavior_js_template: huffpostslides.js
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/ARI-5389
url_regex: '^https?://pitchfork\.com/.*$'
behavior_js_template: umbraBehavior.js.j2
@ -64,13 +68,6 @@
actions:
- selector: div.teaser, li.pager__item a
closeSelector: .pmf-artist-modal__close-btn
-
url_regex: '^https?://(?:www\.)?huffingtonpost\.com/.*$'
behavior_js_template: umbraBehavior.js.j2
default_parameters:
actions:
- selector: .slideshow-card__overlay
- selector: .slideshow__next
-
url_regex: '^https?://(?:www\.)?brooklynmuseum\.org/exhibitions/.*$'
behavior_js_template: simpleclicks.js.j2
@ -157,13 +154,6 @@
actions:
- selector: .menu-item a
do: mouseover
- url_regex: '^https?://(?:www\.)?news\.com\.au/.*$'
behavior_js_template: mouseovers.js.j2
default_parameters:
sdo_css_selector: .menu-item a
sdo_action: mouseover
sdo_until_hard_timeout: False
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/ARI-5259
url_regex: '^https?://blog\.sina\.com\.cn/.*$'
behavior_js_template: simpleclicks.js.j2
@ -213,13 +203,6 @@
click_css_selector: img.link-overlay
click_until_hard_timeout: False
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/ARI-5389
url_regex: '^https?://pitchfork\.com/.*$'
behavior_js_template: pitchfork.js
- # https://webarchive.jira.com/browse/ARI-5379
url_regex: '^https?://(?:www\.)?pm\.gc\.ca/.*$'
behavior_js_template: pm-ca.js
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/ARI-4960
url_regex: '^https?://(?:www\.)?fortstjames.ca/community-events-calendar/$'
behavior_js_template: simpleclicks.js.j2
@ -239,4 +222,4 @@
behavior_js_template: umbraBehavior.js.j2
default_parameters:
actions:
- selector: button.sc-button-play, .playButton, div.soundItem, .jwlist>a, .ytp-button
- selector: button.sc-button-play, .playButton, div.soundItem, .jwlist>a

View File

@ -382,7 +382,7 @@ class Browser:
def browse_page(
self, page_url, extra_headers=None,
user_agent=None, behavior_parameters=None,
user_agent=None, behavior_parameters=None, behaviors_dir=None,
on_request=None, on_response=None, on_screenshot=None,
username=None, password=None, hashtags=None,
skip_extract_outlinks=False, skip_visit_hashtags=False,
@ -402,6 +402,8 @@ class Browser:
supplied (default None)
behavior_parameters: dict of parameters for populating the
javascript behavior template (default None)
behaviors_dir: Directory containing behaviors.yaml and JS templates
(default None loads Brozzler default JS behaviors)
on_request: callback to invoke on every Network.requestWillBeSent
event, takes one argument, the json-decoded message (default
None)
@ -452,7 +454,8 @@ class Browser:
jpeg_bytes = self.screenshot()
on_screenshot(jpeg_bytes)
behavior_script = brozzler.behavior_script(
page_url, behavior_parameters)
page_url, behavior_parameters,
behaviors_dir=behaviors_dir)
self.run_behavior(behavior_script, timeout=behavior_timeout)
if skip_extract_outlinks:
outlinks = []

View File

@ -126,6 +126,7 @@ class Chrome:
'--remote-debugging-port=%s' % self.port,
'--use-mock-keychain', # mac thing
'--user-data-dir=%s' % self._chrome_user_data_dir,
'--disable-background-networking',
'--disable-web-sockets', '--disable-cache',
'--window-size=1100,900', '--no-default-browser-check',
'--disable-first-run-ui', '--no-first-run',
@ -277,13 +278,13 @@ class Chrome:
'chrome pid %s reaped (status=%s) after killing with '
'SIGKILL', self.chrome_process.pid, status)
finally:
try:
self._home_tmpdir.cleanup()
except:
self.logger.error(
'exception deleting %s', self._home_tmpdir,
exc_info=True)
finally:
self._out_reader_thread.join()
self.chrome_process = None

View File

@ -1,177 +0,0 @@
/*
* brozzler/behaviors.d/default.js - default behavior, scrolls to the bottom of
* the page and clicks on selected embedded elements
*
* Copyright (C) 2014-2016 Internet Archive
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Classify element `e` relative to the visible part of the window.
// Returns -1 if its top edge is above the viewport, 1 if it is below the
// viewport, and 0 if it is on screen.
var umbraAboveBelowOrOnScreen = function(e) {
    // getBoundingClientRect() is already relative to the viewport, so it is
    // compared against the viewport bounds (0 .. innerHeight) rather than
    // against the document scroll offset.
    var eTop = e.getBoundingClientRect().top;
    if (eTop < 0) {
        return -1; // above
    } else if (eTop > window.innerHeight) {
        return 1; // below
    } else {
        return 0; // on screen
    }
}
// selector used to find embedded frames to descend into
var UMBRA_IFRAME_EMBEDDED_SELECTOR = "iframe";
//elements selected for SoundCloud.com
var UMBRA_THINGS_TO_CLICK_EMBEDDED_SELECTOR = "button.sc-button-play, .playButton, div.soundItem";
//elements selected for Archive.org Playlists
UMBRA_THINGS_TO_CLICK_EMBEDDED_SELECTOR += ", .jwlist>a"
// deepest iframe nesting level that is searched for click targets
var MAX_IFRAME_RECURSE_DEPTH = 1; //0-based
// idleSince: timestamp (ms) recorded when the behavior last became idle, or
// null while it is still acting
var umbraState = {'idleSince':null};
// ids (css paths) of the elements that have already been clicked
var umbraAlreadyClicked = {};
// NOTE(review): not read anywhere in the visible script - confirm before removing
var umbraFinished = false;
// One tick of the behavior (run from setInterval): click at most one
// not-yet-clicked target that is on screen; otherwise scroll towards the
// remaining targets or the bottom of the page; otherwise record idleness.
var umbraIntervalFunc = function() {
var umbraEmbeddedElements = [];
getUmbraEmbeddedElements(umbraEmbeddedElements);
var clickedSomething = false;
var somethingLeftBelow = false;
var somethingLeftAbove = false;
// NOTE(review): missedAbove is assigned but never read in this function
var missedAbove = 0;
for (var i = 0; i < umbraEmbeddedElements.length; i++) {
var targetId = umbraEmbeddedElements[i].id;
var target = umbraEmbeddedElements[i].target;
// targets are identified by their css path, so each one is clicked once
if (!(targetId in umbraAlreadyClicked)) {
var where = umbraAboveBelowOrOnScreen(target);
if (where == 0) { // on screen
// var pos = target.getBoundingClientRect().top;
// window.scrollTo(0, target.getBoundingClientRect().top - 100);
console.log("clicking at " + target.getBoundingClientRect().top + " on " + target.outerHTML);
if (target.click != undefined) {
target.click();
}
umbraAlreadyClicked[targetId] = true;
clickedSomething = true;
umbraState.idleSince = null;
// only one click per tick; remaining targets wait for the next tick
break;
} else if (where > 0) {
somethingLeftBelow = true;
} else if (where < 0) {
somethingLeftAbove = true;
}
}
}
// nothing was clicked this tick: scroll up for missed targets first, then
// down for remaining targets, then down until the bottom of the page
if (!clickedSomething) {
if (somethingLeftAbove) {
console.log("scrolling UP because everything on this screen has been clicked but we missed something above");
window.scrollBy(0, -500);
umbraState.idleSince = null;
} else if (somethingLeftBelow) {
console.log("scrolling because everything on this screen has been clicked but there's more below document.body.clientHeight=" + document.body.clientHeight);
window.scrollBy(0, 200);
umbraState.idleSince = null;
} else if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) {
console.log("scrolling because we're not to the bottom yet document.body.clientHeight=" + document.body.clientHeight);
window.scrollBy(0, 200);
umbraState.idleSince = null;
} else if (umbraState.idleSince == null) {
umbraState.idleSince = Date.now();
}
}
// idleSince was reset to null above if this tick clicked or scrolled;
// stamping it here makes it hold the time of the most recent action
if (umbraState.idleSince == null) {
umbraState.idleSince = Date.now();
}
}
// Try to detect sound cloud "Play" buttons (and the other elements matched by
// UMBRA_THINGS_TO_CLICK_EMBEDDED_SELECTOR) and return them as targets for
// clicking.
//
// embeddedElements: array that receives {"id": <css path>, "target": <element>}
// currentIframeDepth: current nesting level, 0 for the top document (default 0)
// currentDocument: node to search, anything with querySelectorAll (default document)
// iframeElement: the iframe that contains currentDocument, if any; its css
//     path prefixes the ids of the elements found inside it
var getUmbraEmbeddedElements = function(embeddedElements, currentIframeDepth, currentDocument,
        iframeElement) {
    //set default values for parameters
    currentIframeDepth = currentIframeDepth || 0;
    currentDocument = currentDocument || document;

    if (currentIframeDepth > MAX_IFRAME_RECURSE_DEPTH) {
        return;
    }

    //collect all buttons on current document first
    var cssPathIframe = iframeElement ? getElementCssPath(iframeElement) : "";
    var buttons = currentDocument.querySelectorAll(UMBRA_THINGS_TO_CLICK_EMBEDDED_SELECTOR);
    for (var i = 0; i < buttons.length; i++) {
        embeddedElements.push({"id" : cssPathIframe + getElementCssPath(buttons[i]), "target" : buttons[i]});
    }

    //now get all buttons in embedded iframes
    var iframes = currentDocument.querySelectorAll(UMBRA_IFRAME_EMBEDDED_SELECTOR);
    for (var j = 0; j < iframes.length; j++) {
        var iframeBody = null;
        try {
            iframeBody = iframes[j].contentWindow.document.body;
        } catch (err) {
            // cross-origin or detached iframe: its document is not
            // accessible, skip it instead of aborting the whole scan
            continue;
        }
        // an iframe without a body yet must be skipped; passing null on
        // would fall back to the top document and collect its elements again
        if (iframeBody) {
            getUmbraEmbeddedElements(embeddedElements, currentIframeDepth + 1, iframeBody, iframes[j]);
        }
    }
}
// Idle threshold: once nothing has been scrolled or clicked for longer than
// this many seconds, the behavior is considered done with the page.
var UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC = 10;

// Called from outside of this script.
// Returns true (and stops the interval timer) once the behavior has been
// idle for longer than UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC, false otherwise.
var umbraBehaviorFinished = function() {
    if (umbraState.idleSince == null) {
        return false;
    }
    var idleSeconds = (Date.now() - umbraState.idleSince) / 1000;
    if (idleSeconds > UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC) {
        clearInterval(umbraIntervalId)
        return true;
    }
    return false;
}
//adapted from http://stackoverflow.com/questions/4588119/get-elements-css-selector-without-element-id
// Build a css path for `element` by walking up its ancestors. The walk stops
// at the first node that has an id (emitted as "#id") or at the top of the
// tree; every other node contributes "TAG:nth-child(n)", except the document
// element, which contributes just its tag name.
var getElementCssPath = function(element) {
    var segments = [];
    var node = element;
    while (node.parentNode) {
        if (node.id) {
            segments.unshift('#' + node.id);
            break;
        }
        if (node == node.ownerDocument.documentElement) {
            segments.unshift(node.tagName);
        } else {
            // 1-based position among the element siblings
            var position = 1;
            var sibling = node;
            while (sibling.previousElementSibling) {
                sibling = sibling.previousElementSibling;
                position++;
            }
            segments.unshift(node.tagName + ":nth-child(" + position + ")");
        }
        node = node.parentNode;
    }
    return segments.join(" > ");
}
// start the behavior: run umbraIntervalFunc every 100 ms; the id is kept so
// that umbraBehaviorFinished can stop the timer
var umbraIntervalId = setInterval(umbraIntervalFunc, 100);

View File

@ -1,141 +0,0 @@
/*
* brozzler/behaviors.d/pm-ca.js - behavior for http://www.pm.gc.ca/
*
* Copyright (C) 2014-2017 Internet Archive
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Behavior object: every 500 ms it clicks one visible, not-yet-clicked
// "div.teaser" element found in the top document or in an iframe, or scrolls
// when nothing on screen is left to click. isFinished() reports true once
// nothing has happened for more than IDLE_TIMEOUT_SEC seconds.
var umbraBehavior = {
IDLE_TIMEOUT_SEC : 10,
// timestamp (ms) of when the behavior last became idle, null while acting
idleSince : null,
// concatenated innerText of every target clicked so far; used to recognise
// targets that were already clicked
itemsText : "",
// https://github.com/jquery/jquery/blob/master/src/css/hiddenVisibleSelectors.js
// n.b. returns true for elements with visibility:hidden, which occupy
// screen real estate but are not visible, or clickable with the ui
isVisible : function(elem) {
return !!(elem.offsetWidth || elem.offsetHeight || elem.getClientRects().length);
},
// One tick: click at most one target per document, otherwise scroll,
// otherwise record idleness.
intervalFunc : function() {
var clickedSomething = false;
var somethingLeftBelow = false;
var somethingLeftAbove = false;
var cssSelector = "div.teaser";
// NOTE(review): clickUntilTimeout is assigned but never read in this object
var clickUntilTimeout = 10;
// documents[0] is the top document, followed by one entry per iframe
var iframes = document.querySelectorAll("iframe");
var documents = Array(iframes.length + 1);
documents[0] = document;
for (var i = 0; i < iframes.length; i++) {
// NOTE(review): presumably throws for cross-origin iframes, which would
// abort the whole tick - confirm and guard if so
documents[i+1] = iframes[i].contentWindow.document;
}
for (var j = 0; j < documents.length; j++) {
var clickTargets = documents[j].querySelectorAll(cssSelector);
for (var i = 0; i < clickTargets.length; i++) {
if (!this.isVisible(clickTargets[i])) {
continue;
}
// substring match against the text of everything clicked so far; a target
// with empty innerText always matches and is therefore never clicked
if (this.itemsText.indexOf(clickTargets[i].innerText) > -1) {
continue;
}
var where = this.aboveBelowOrOnScreen(clickTargets[i]);
if (where == 0) {
// console.log("clicking on " + clickTargets[i].outerHTML);
// do mouse over event on click target
// since some urls are requested only on
// this event - see
// https://webarchive.jira.com/browse/AITFIVE-451
var mouseOverEvent = document.createEvent('Events');
mouseOverEvent.initEvent("mouseover",true, false);
clickTargets[i].dispatchEvent(mouseOverEvent);
clickTargets[i].click();
clickedSomething = true;
this.idleSince = null;
this.itemsText += clickTargets[i].innerText;
break; //break from clickTargets loop, but not from iframe loop
} else if (where > 0) {
somethingLeftBelow = true;
} else if (where < 0) {
somethingLeftAbove = true;
}
}
}
// nothing was clicked this tick: scroll up for missed targets first, then
// down for remaining targets, then down until the bottom of the page
if (!clickedSomething) {
if (somethingLeftAbove) {
// console.log("scrolling UP because everything on this screen has been clicked but we missed something above");
window.scrollBy(0, -500);
this.idleSince = null;
} else if (somethingLeftBelow) {
// console.log("scrolling because everything on this screen has been clicked but there's more below document.body.clientHeight="
// + document.body.clientHeight);
window.scrollBy(0, 200);
this.idleSince = null;
} else if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) {
// console.log("scrolling because we're not to the bottom yet document.body.clientHeight="
// + document.body.clientHeight);
window.scrollBy(0, 200);
this.idleSince = null;
} else if (this.idleSince == null) {
this.idleSince = Date.now();
}
}
// idleSince was reset to null above if this tick clicked or scrolled;
// stamping it here makes it hold the time of the most recent action
if (!this.idleSince) {
this.idleSince = Date.now();
}
},
// Begin running intervalFunc every 500 ms; the timer id is kept in
// this.intervalId so isFinished() can stop it.
start : function() {
var that = this;
this.intervalId = setInterval(function() {
that.intervalFunc()
}, 500);
},
// Returns true, and stops the timer, once the behavior has been idle for
// more than IDLE_TIMEOUT_SEC seconds; false otherwise.
isFinished : function() {
if (this.idleSince != null) {
var idleTimeMs = Date.now() - this.idleSince;
if (idleTimeMs / 1000 > this.IDLE_TIMEOUT_SEC) {
clearInterval(this.intervalId);
return true;
}
}
return false;
},
// Returns -1, 1 or 0 for an element judged above, below or on screen.
// NOTE(review): getBoundingClientRect().top is viewport-relative while
// window.scrollY is a document offset - confirm the comparison is intended
aboveBelowOrOnScreen : function(e) {
var eTop = e.getBoundingClientRect().top;
if (eTop < window.scrollY) {
return -1; // above
} else if (eTop > window.scrollY + window.innerHeight) {
return 1; // below
} else {
return 0; // on screen
}
},
};
// Called from outside of this script.
// Reports whether the behavior has finished with the page, as decided by
// umbraBehavior.isFinished().
var umbraBehaviorFinished = function() {
    var finished = umbraBehavior.isFinished();
    return finished;
};
// kick off the behavior as soon as the script is loaded
umbraBehavior.start();

View File

@ -1,7 +1,11 @@
/*
* brozzler/js-templates/umbrabehavior.js.j2 - an umbra/brozzler behavior class
*
* Copyright (C) 2017-2018 Internet Archive
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.