mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-21 16:16:28 -04:00
Merge branch 'brofurb' into qa
This commit is contained in:
commit
2773c4ab6f
@ -68,29 +68,34 @@ logging._levelToName[TRACE] = 'TRACE'
|
||||
logging._nameToLevel['TRACE'] = TRACE
|
||||
|
||||
_behaviors = None
|
||||
def behaviors():
|
||||
def behaviors(behaviors_dir=None):
|
||||
"""Return list of JS behaviors loaded from YAML file.
|
||||
|
||||
:param behaviors_dir: Directory containing `behaviors.yaml` and
|
||||
`js-templates/`. Defaults to brozzler dir.
|
||||
"""
|
||||
import os, yaml, string
|
||||
global _behaviors
|
||||
if _behaviors is None:
|
||||
behaviors_yaml = os.path.join(
|
||||
os.path.dirname(__file__), 'behaviors.yaml')
|
||||
d = behaviors_dir or os.path.dirname(__file__)
|
||||
behaviors_yaml = os.path.join(d, 'behaviors.yaml')
|
||||
with open(behaviors_yaml) as fin:
|
||||
_behaviors = yaml.load(fin)
|
||||
return _behaviors
|
||||
|
||||
def behavior_script(url, template_parameters=None):
|
||||
def behavior_script(url, template_parameters=None, behaviors_dir=None):
|
||||
'''
|
||||
Returns the javascript behavior string populated with template_parameters.
|
||||
'''
|
||||
import re, logging
|
||||
for behavior in behaviors():
|
||||
for behavior in behaviors(behaviors_dir=behaviors_dir):
|
||||
if re.match(behavior['url_regex'], url):
|
||||
parameters = dict()
|
||||
if 'default_parameters' in behavior:
|
||||
parameters.update(behavior['default_parameters'])
|
||||
if template_parameters:
|
||||
parameters.update(template_parameters)
|
||||
template = jinja2_environment().get_template(
|
||||
template = jinja2_environment(behaviors_dir).get_template(
|
||||
behavior['behavior_js_template'])
|
||||
script = template.render(parameters)
|
||||
logging.info(
|
||||
@ -229,12 +234,16 @@ def sleep(duration):
|
||||
time.sleep(min(duration - elapsed, 0.5))
|
||||
|
||||
_jinja2_env = None
|
||||
def jinja2_environment():
|
||||
def jinja2_environment(behaviors_dir=None):
|
||||
global _jinja2_env
|
||||
if not _jinja2_env:
|
||||
import jinja2, json
|
||||
_jinja2_env = jinja2.Environment(
|
||||
loader=jinja2.PackageLoader('brozzler', 'js-templates'))
|
||||
import os, jinja2, json
|
||||
if behaviors_dir:
|
||||
_loader = jinja2.FileSystemLoader(os.path.join(behaviors_dir,
|
||||
'js-templates'))
|
||||
else:
|
||||
_loader=jinja2.PackageLoader('brozzler', 'js-templates')
|
||||
_jinja2_env = jinja2.Environment(loader=_loader)
|
||||
_jinja2_env.filters['json'] = json.dumps
|
||||
return _jinja2_env
|
||||
|
||||
|
@ -1,7 +1,7 @@
|
||||
#
|
||||
# brozzler/behaviors.yaml - behavior configuration
|
||||
#
|
||||
# Copyright (C) 2014-2017 Internet Archive
|
||||
# Copyright (C) 2014-2018 Internet Archive
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
@ -57,6 +57,10 @@
|
||||
click_css_selector: img.link-overlay
|
||||
click_until_hard_timeout: False
|
||||
request_idle_timeout_sec: 10
|
||||
-
|
||||
url_regex: '^https?://(?:www\.)?huffingtonpost\.com/.*$'
|
||||
behavior_js_template: huffpostslides.js
|
||||
request_idle_timeout_sec: 10
|
||||
- # https://webarchive.jira.com/browse/ARI-5389
|
||||
url_regex: '^https?://pitchfork\.com/.*$'
|
||||
behavior_js_template: umbraBehavior.js.j2
|
||||
@ -64,13 +68,6 @@
|
||||
actions:
|
||||
- selector: div.teaser, li.pager__item a
|
||||
closeSelector: .pmf-artist-modal__close-btn
|
||||
-
|
||||
url_regex: '^https?://(?:www\.)?huffingtonpost\.com/.*$'
|
||||
behavior_js_template: umbraBehavior.js.j2
|
||||
default_parameters:
|
||||
actions:
|
||||
- selector: .slideshow-card__overlay
|
||||
- selector: .slideshow__next
|
||||
-
|
||||
url_regex: '^https?://(?:www\.)?brooklynmuseum\.org/exhibitions/.*$'
|
||||
behavior_js_template: simpleclicks.js.j2
|
||||
@ -157,13 +154,6 @@
|
||||
actions:
|
||||
- selector: .menu-item a
|
||||
do: mouseover
|
||||
- url_regex: '^https?://(?:www\.)?news\.com\.au/.*$'
|
||||
behavior_js_template: mouseovers.js.j2
|
||||
default_parameters:
|
||||
sdo_css_selector: .menu-item a
|
||||
sdo_action: mouseover
|
||||
sdo_until_hard_timeout: False
|
||||
request_idle_timeout_sec: 10
|
||||
- # https://webarchive.jira.com/browse/ARI-5259
|
||||
url_regex: '^https?://blog\.sina\.com\.cn/.*$'
|
||||
behavior_js_template: simpleclicks.js.j2
|
||||
@ -213,13 +203,6 @@
|
||||
click_css_selector: img.link-overlay
|
||||
click_until_hard_timeout: False
|
||||
request_idle_timeout_sec: 10
|
||||
- # https://webarchive.jira.com/browse/ARI-5389
|
||||
url_regex: '^https?://pitchfork\.com/.*$'
|
||||
behavior_js_template: pitchfork.js
|
||||
- # https://webarchive.jira.com/browse/ARI-5379
|
||||
url_regex: '^https?://(?:www\.)?pm\.gc\.ca/.*$'
|
||||
behavior_js_template: pm-ca.js
|
||||
request_idle_timeout_sec: 10
|
||||
- # https://webarchive.jira.com/browse/ARI-4960
|
||||
url_regex: '^https?://(?:www\.)?fortstjames.ca/community-events-calendar/$'
|
||||
behavior_js_template: simpleclicks.js.j2
|
||||
@ -239,4 +222,4 @@
|
||||
behavior_js_template: umbraBehavior.js.j2
|
||||
default_parameters:
|
||||
actions:
|
||||
- selector: button.sc-button-play, .playButton, div.soundItem, .jwlist>a, .ytp-button
|
||||
- selector: button.sc-button-play, .playButton, div.soundItem, .jwlist>a
|
||||
|
@ -382,7 +382,7 @@ class Browser:
|
||||
|
||||
def browse_page(
|
||||
self, page_url, extra_headers=None,
|
||||
user_agent=None, behavior_parameters=None,
|
||||
user_agent=None, behavior_parameters=None, behaviors_dir=None,
|
||||
on_request=None, on_response=None, on_screenshot=None,
|
||||
username=None, password=None, hashtags=None,
|
||||
skip_extract_outlinks=False, skip_visit_hashtags=False,
|
||||
@ -402,6 +402,8 @@ class Browser:
|
||||
supplied (default None)
|
||||
behavior_parameters: dict of parameters for populating the
|
||||
javascript behavior template (default None)
|
||||
behaviors_dir: Directory containing behaviors.yaml and JS templates
|
||||
(default None loads Brozzler default JS behaviors)
|
||||
on_request: callback to invoke on every Network.requestWillBeSent
|
||||
event, takes one argument, the json-decoded message (default
|
||||
None)
|
||||
@ -452,7 +454,8 @@ class Browser:
|
||||
jpeg_bytes = self.screenshot()
|
||||
on_screenshot(jpeg_bytes)
|
||||
behavior_script = brozzler.behavior_script(
|
||||
page_url, behavior_parameters)
|
||||
page_url, behavior_parameters,
|
||||
behaviors_dir=behaviors_dir)
|
||||
self.run_behavior(behavior_script, timeout=behavior_timeout)
|
||||
if skip_extract_outlinks:
|
||||
outlinks = []
|
||||
|
@ -126,6 +126,7 @@ class Chrome:
|
||||
'--remote-debugging-port=%s' % self.port,
|
||||
'--use-mock-keychain', # mac thing
|
||||
'--user-data-dir=%s' % self._chrome_user_data_dir,
|
||||
'--disable-background-networking',
|
||||
'--disable-web-sockets', '--disable-cache',
|
||||
'--window-size=1100,900', '--no-default-browser-check',
|
||||
'--disable-first-run-ui', '--no-first-run',
|
||||
@ -277,13 +278,13 @@ class Chrome:
|
||||
'chrome pid %s reaped (status=%s) after killing with '
|
||||
'SIGKILL', self.chrome_process.pid, status)
|
||||
|
||||
finally:
|
||||
try:
|
||||
self._home_tmpdir.cleanup()
|
||||
except:
|
||||
self.logger.error(
|
||||
'exception deleting %s', self._home_tmpdir,
|
||||
exc_info=True)
|
||||
finally:
|
||||
self._out_reader_thread.join()
|
||||
self.chrome_process = None
|
||||
|
||||
|
@ -1,177 +0,0 @@
|
||||
/*
|
||||
* brozzler/behaviors.d/default.js - default behavior, scrolls to the bottom of
|
||||
* the page and clicks on selected embedded elements
|
||||
*
|
||||
* Copyright (C) 2014-2016 Internet Archive
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
var umbraAboveBelowOrOnScreen = function(e) {
|
||||
var eTop = e.getBoundingClientRect().top;
|
||||
if (eTop < window.scrollY) {
|
||||
return -1; // above
|
||||
} else if (eTop > window.scrollY + window.innerHeight) {
|
||||
return 1; // below
|
||||
} else {
|
||||
return 0; // on screen
|
||||
}
|
||||
}
|
||||
|
||||
var UMBRA_IFRAME_EMBEDDED_SELECTOR = "iframe";
|
||||
//elements selected for SoundCloud.com
|
||||
var UMBRA_THINGS_TO_CLICK_EMBEDDED_SELECTOR = "button.sc-button-play, .playButton, div.soundItem";
|
||||
//elements selected for Archive.org Playlists
|
||||
UMBRA_THINGS_TO_CLICK_EMBEDDED_SELECTOR += ", .jwlist>a"
|
||||
var MAX_IFRAME_RECURSE_DEPTH = 1; //0-based
|
||||
var umbraState = {'idleSince':null};
|
||||
var umbraAlreadyClicked = {};
|
||||
var umbraFinished = false;
|
||||
var umbraIntervalFunc = function() {
|
||||
|
||||
var umbraEmbeddedElements = [];
|
||||
|
||||
getUmbraEmbeddedElements(umbraEmbeddedElements);
|
||||
|
||||
var clickedSomething = false;
|
||||
var somethingLeftBelow = false;
|
||||
var somethingLeftAbove = false;
|
||||
var missedAbove = 0;
|
||||
|
||||
for (var i = 0; i < umbraEmbeddedElements.length; i++) {
|
||||
|
||||
var targetId = umbraEmbeddedElements[i].id;
|
||||
var target = umbraEmbeddedElements[i].target;
|
||||
|
||||
if (!(targetId in umbraAlreadyClicked)) {
|
||||
|
||||
var where = umbraAboveBelowOrOnScreen(target);
|
||||
|
||||
if (where == 0) { // on screen
|
||||
// var pos = target.getBoundingClientRect().top;
|
||||
// window.scrollTo(0, target.getBoundingClientRect().top - 100);
|
||||
console.log("clicking at " + target.getBoundingClientRect().top + " on " + target.outerHTML);
|
||||
if (target.click != undefined) {
|
||||
target.click();
|
||||
}
|
||||
umbraAlreadyClicked[targetId] = true;
|
||||
clickedSomething = true;
|
||||
umbraState.idleSince = null;
|
||||
break;
|
||||
} else if (where > 0) {
|
||||
somethingLeftBelow = true;
|
||||
} else if (where < 0) {
|
||||
somethingLeftAbove = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!clickedSomething) {
|
||||
if (somethingLeftAbove) {
|
||||
console.log("scrolling UP because everything on this screen has been clicked but we missed something above");
|
||||
window.scrollBy(0, -500);
|
||||
umbraState.idleSince = null;
|
||||
} else if (somethingLeftBelow) {
|
||||
console.log("scrolling because everything on this screen has been clicked but there's more below document.body.clientHeight=" + document.body.clientHeight);
|
||||
window.scrollBy(0, 200);
|
||||
umbraState.idleSince = null;
|
||||
} else if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) {
|
||||
console.log("scrolling because we're not to the bottom yet document.body.clientHeight=" + document.body.clientHeight);
|
||||
window.scrollBy(0, 200);
|
||||
umbraState.idleSince = null;
|
||||
} else if (umbraState.idleSince == null) {
|
||||
umbraState.idleSince = Date.now();
|
||||
}
|
||||
}
|
||||
|
||||
if (umbraState.idleSince == null) {
|
||||
umbraState.idleSince = Date.now();
|
||||
}
|
||||
}
|
||||
|
||||
//try to detect sound cloud "Play" buttons and return them as targets for clicking
|
||||
var getUmbraEmbeddedElements = function(embeddedElements, currentIframeDepth, currentDocument,
|
||||
iframeElement) {
|
||||
|
||||
//set default values for parameters
|
||||
currentIframeDepth = currentIframeDepth || 0;
|
||||
currentDocument = currentDocument || document;
|
||||
|
||||
if (currentIframeDepth > MAX_IFRAME_RECURSE_DEPTH) {
|
||||
return;
|
||||
}
|
||||
|
||||
//collect all buttons on current document first
|
||||
var button = [];
|
||||
|
||||
button = currentDocument.querySelectorAll(UMBRA_THINGS_TO_CLICK_EMBEDDED_SELECTOR);
|
||||
|
||||
var cssPathIframe = iframeElement ? getElementCssPath(iframeElement) : "";
|
||||
|
||||
for (var i = 0; i < button.length; i++) {
|
||||
embeddedElements.push({"id" : cssPathIframe + getElementCssPath(button.item(i)), "target" : button.item(i)});
|
||||
}
|
||||
|
||||
//now get all buttons in embedded iframes
|
||||
var iframe = [];
|
||||
|
||||
iframe = currentDocument.querySelectorAll(UMBRA_IFRAME_EMBEDDED_SELECTOR);
|
||||
|
||||
for (var i = 0; i < iframe.length; i++) {
|
||||
getUmbraEmbeddedElements(embeddedElements, currentIframeDepth + 1, iframe[i].contentWindow.document.body, iframe[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// If we haven't had anything to do (scrolled, clicked, etc) in this amount of
|
||||
// time, then we consider ourselves finished with the page.
|
||||
var UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC = 10;
|
||||
|
||||
// Called from outside of this script.
|
||||
var umbraBehaviorFinished = function() {
|
||||
if (umbraState.idleSince != null) {
|
||||
var idleTimeMs = Date.now() - umbraState.idleSince;
|
||||
if (idleTimeMs / 1000 > UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC) {
|
||||
clearInterval(umbraIntervalId)
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
//copied from http://stackoverflow.com/questions/4588119/get-elements-css-selector-without-element-id
|
||||
var getElementCssPath = function(element) {
|
||||
|
||||
var names = [];
|
||||
|
||||
while (element.parentNode){
|
||||
if (element.id){
|
||||
names.unshift('#' + element.id);
|
||||
break;
|
||||
} else {
|
||||
if (element == element.ownerDocument.documentElement) {
|
||||
names.unshift(element.tagName);
|
||||
}
|
||||
else {
|
||||
for (var c = 1, e = element; e.previousElementSibling; e = e.previousElementSibling, c++);
|
||||
|
||||
names.unshift(element.tagName + ":nth-child(" + c + ")");
|
||||
}
|
||||
|
||||
element = element.parentNode;
|
||||
}
|
||||
}
|
||||
|
||||
return names.join(" > ");
|
||||
}
|
||||
|
||||
var umbraIntervalId = setInterval(umbraIntervalFunc, 100);
|
@ -1,141 +0,0 @@
|
||||
/*
|
||||
* brozzler/behaviors.d/pm-ca.js - behavior for http://www.pm.gc.ca/
|
||||
*
|
||||
* Copyright (C) 2014-2017 Internet Archive
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
var umbraBehavior = {
|
||||
IDLE_TIMEOUT_SEC : 10,
|
||||
idleSince : null,
|
||||
itemsText : "",
|
||||
|
||||
// https://github.com/jquery/jquery/blob/master/src/css/hiddenVisibleSelectors.js
|
||||
// n.b. returns true for elements with visibility:hidden, which occupy
|
||||
// screen real estate but are not visible, or clickable with the ui
|
||||
isVisible : function(elem) {
|
||||
return !!(elem.offsetWidth || elem.offsetHeight || elem.getClientRects().length);
|
||||
},
|
||||
|
||||
intervalFunc : function() {
|
||||
var clickedSomething = false;
|
||||
var somethingLeftBelow = false;
|
||||
var somethingLeftAbove = false;
|
||||
var cssSelector = "div.teaser";
|
||||
var clickUntilTimeout = 10;
|
||||
|
||||
var iframes = document.querySelectorAll("iframe");
|
||||
var documents = Array(iframes.length + 1);
|
||||
documents[0] = document;
|
||||
|
||||
for (var i = 0; i < iframes.length; i++) {
|
||||
documents[i+1] = iframes[i].contentWindow.document;
|
||||
}
|
||||
|
||||
for (var j = 0; j < documents.length; j++) {
|
||||
var clickTargets = documents[j].querySelectorAll(cssSelector);
|
||||
for (var i = 0; i < clickTargets.length; i++) {
|
||||
if (!this.isVisible(clickTargets[i])) {
|
||||
continue;
|
||||
}
|
||||
if (this.itemsText.indexOf(clickTargets[i].innerText) > -1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
var where = this.aboveBelowOrOnScreen(clickTargets[i]);
|
||||
|
||||
if (where == 0) {
|
||||
// console.log("clicking on " + clickTargets[i].outerHTML);
|
||||
// do mouse over event on click target
|
||||
// since some urls are requested only on
|
||||
// this event - see
|
||||
// https://webarchive.jira.com/browse/AITFIVE-451
|
||||
var mouseOverEvent = document.createEvent('Events');
|
||||
mouseOverEvent.initEvent("mouseover",true, false);
|
||||
clickTargets[i].dispatchEvent(mouseOverEvent);
|
||||
clickTargets[i].click();
|
||||
clickedSomething = true;
|
||||
this.idleSince = null;
|
||||
this.itemsText += clickTargets[i].innerText;
|
||||
|
||||
break; //break from clickTargets loop, but not from iframe loop
|
||||
} else if (where > 0) {
|
||||
somethingLeftBelow = true;
|
||||
} else if (where < 0) {
|
||||
somethingLeftAbove = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!clickedSomething) {
|
||||
if (somethingLeftAbove) {
|
||||
// console.log("scrolling UP because everything on this screen has been clicked but we missed something above");
|
||||
window.scrollBy(0, -500);
|
||||
this.idleSince = null;
|
||||
} else if (somethingLeftBelow) {
|
||||
// console.log("scrolling because everything on this screen has been clicked but there's more below document.body.clientHeight="
|
||||
// + document.body.clientHeight);
|
||||
window.scrollBy(0, 200);
|
||||
this.idleSince = null;
|
||||
} else if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) {
|
||||
// console.log("scrolling because we're not to the bottom yet document.body.clientHeight="
|
||||
// + document.body.clientHeight);
|
||||
window.scrollBy(0, 200);
|
||||
this.idleSince = null;
|
||||
} else if (this.idleSince == null) {
|
||||
this.idleSince = Date.now();
|
||||
}
|
||||
}
|
||||
|
||||
if (!this.idleSince) {
|
||||
this.idleSince = Date.now();
|
||||
}
|
||||
},
|
||||
|
||||
start : function() {
|
||||
var that = this;
|
||||
this.intervalId = setInterval(function() {
|
||||
that.intervalFunc()
|
||||
}, 500);
|
||||
},
|
||||
|
||||
isFinished : function() {
|
||||
if (this.idleSince != null) {
|
||||
var idleTimeMs = Date.now() - this.idleSince;
|
||||
if (idleTimeMs / 1000 > this.IDLE_TIMEOUT_SEC) {
|
||||
clearInterval(this.intervalId);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
},
|
||||
|
||||
aboveBelowOrOnScreen : function(e) {
|
||||
var eTop = e.getBoundingClientRect().top;
|
||||
if (eTop < window.scrollY) {
|
||||
return -1; // above
|
||||
} else if (eTop > window.scrollY + window.innerHeight) {
|
||||
return 1; // below
|
||||
} else {
|
||||
return 0; // on screen
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
// Called from outside of this script.
|
||||
var umbraBehaviorFinished = function() {
|
||||
return umbraBehavior.isFinished()
|
||||
};
|
||||
|
||||
umbraBehavior.start();
|
@ -1,7 +1,11 @@
|
||||
/*
|
||||
* brozzler/js-templates/umbrabehavior.js.j2 - an umbra/brozzler behavior class
|
||||
*
|
||||
* Copyright (C) 2017-2018 Internet Archive
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
|
Loading…
x
Reference in New Issue
Block a user