copy over latest behaviors and stuff from umbra

This commit is contained in:
Noah Levitt 2016-05-05 00:58:26 -07:00
parent 0af00bb3d5
commit cea192b4b3
6 changed files with 283 additions and 46 deletions

View File

@ -0,0 +1,206 @@
/*
* brozzler/behaviors.d/facebook.js - facebook behavior, scrolls to the bottom
* of the page, clicks to expand images, a few other things
*
* Copyright (C) 2014-2016 Internet Archive
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Classifies an element's vertical position relative to the visible viewport.
 *
 * getBoundingClientRect() reports coordinates relative to the viewport, not
 * the document, so the element's top edge is compared against 0 and
 * window.innerHeight. Adding window.scrollY here would count the scroll
 * offset twice and misclassify elements once the page has been scrolled.
 *
 * @param {Element} e - element to classify
 * @returns {number} -1 if the element's top edge is above the viewport,
 *     1 if it is below the viewport, 0 if it is on screen
 */
var umbraAboveBelowOrOnScreen = function(e) {
    var eTop = e.getBoundingClientRect().top;
    if (eTop < 0) {
        return -1; // above
    } else if (eTop > window.innerHeight) {
        return 1; // below
    } else {
        return 0; // on screen
    }
};
// CSS selector for the elements this behavior clicks, one per interval tick.
// Not currently included (selectors for expanding comments):
// comments - 'a.UFIPagerLink > span, a.UFIPagerLink, span.UFIReplySocialSentenceLinkText'
var UMBRA_THINGS_TO_CLICK_SELECTOR = 'a[href^="/browse/likes"], *[rel="theater"]';
// CSS selector for elements that are scrolled to their own bottom (as opposed
// to scrolling the window).
//div[class="phm pluginLikeboxStream"] = facebook widget embedded in 3rd party pages
var UMBRA_THINGS_TO_SCROLL_SELECTOR = 'div[class="phm pluginLikeboxStream"]';
// After this many consecutive scroll attempts that leave scrollTop unchanged,
// an element is treated as fully scrolled.
var NUMBER_FAILED_SCROLL_ATTEMPTS_ON_THING_TO_SCROLL_BEFORE_STOP_SCROLLING = 5;
// Login credentials. The values below are template placeholders; the startup
// code at the bottom of this script treats a value that still contains the
// text "parameter" as unset and skips the login.
var UMBRA_FB_USER_NAME = "${parameter_username}";
var UMBRA_FB_PASSWORD = "${parameter_password}";
// Bookkeeping objects used as sets/maps. They are indexed with DOM elements,
// which JavaScript coerces to string property keys.
var umbraAlreadyClicked = {};
var umbraAlreadyScrolledThing = {};
var umbraScrolledThingFailedScrollAttempts = {};
// Shared state of the behavior:
//   idleSince            - Date.now() timestamp of when there was last nothing
//                          left to do, or null while still active
//   expectingSomething   - 'closeButton' after a click, until a close button
//                          has been found and clicked; otherwise null
//   bottomReachedScrollY - largest window.scrollY observed so far
var umbraState = {'idleSince':null,'expectingSomething':null,'bottomReachedScrollY':0};
// Performs one step of the facebook behavior. Each call does at most one of
// the following, in this order, and then returns:
//   1. scrolls every element matching UMBRA_THINGS_TO_SCROLL_SELECTOR to its
//      bottom; if such elements exist and all are done, marks the behavior
//      idle and returns without doing anything else
//   2. clicks the first visible close button, if any
//   3. keeps waiting if a close button is expected but not yet visible
//   4. clicks the first not-yet-clicked on-screen element matching
//      UMBRA_THINGS_TO_CLICK_SELECTOR
//   5. otherwise scrolls the window (down, or back up for missed elements),
//      or records the time at which there was nothing left to do
// Takes no arguments and returns nothing; works entirely through the DOM and
// the umbra* globals.
// NOTE(review): `target in obj` and `obj[target]` coerce the element to a
// string property key, so elements that stringify identically share a single
// entry - confirm this is the intended notion of "already handled".
var umbraIntervalFunc = function() {
// Step 1: elements that scroll independently of the window.
var thingsToScroll = document.querySelectorAll(UMBRA_THINGS_TO_SCROLL_SELECTOR);
var everythingScrolled = true;
for (var i = 0; i < thingsToScroll.length; i++) {
var target = thingsToScroll[i];
if (!(target in umbraAlreadyScrolledThing)) {
everythingScrolled = false;
console.log("scrolling to " + target.scrollHeight + " on element with nodeName " + target.nodeName + " with id of " + target.id);
var lastScrollTop = target.scrollTop;
target.scrollTop = target.scrollHeight;
umbraState.idleSince = null;
if (target.scrollTop >= target.scrollHeight) {
umbraAlreadyScrolledThing[target] = true;
}
else if (target.scrollTop == lastScrollTop) {
// the scroll position did not move: count a failed attempt, and give up on
// this element once the limit is reached
if (umbraScrolledThingFailedScrollAttempts[target]) {
umbraScrolledThingFailedScrollAttempts[target]++;
}
else {
umbraScrolledThingFailedScrollAttempts[target] = 1;
}
if (umbraScrolledThingFailedScrollAttempts[target] >= NUMBER_FAILED_SCROLL_ATTEMPTS_ON_THING_TO_SCROLL_BEFORE_STOP_SCROLLING) {
umbraAlreadyScrolledThing[target] = true;
}
}
else {
//reset failed count on a successful scroll
umbraScrolledThingFailedScrollAttempts[target] = 0;
}
}
else {
console.log("done scrolling for element with nodeName " + target.nodeName + " with id of " + target.id)
}
// runs for every scrollable element found, whether or not it was scrolled,
// and clears any pending close-button expectation
umbraState.expectingSomething = null;
}
// Scrollable elements exist and all are finished: start (or keep) the idle
// clock and skip the clicking/window-scrolling steps below.
if (thingsToScroll && thingsToScroll.length > 0 && everythingScrolled) {
if (umbraState.idleSince == null) {
umbraState.idleSince = Date.now();
}
return;
}
// Step 2: close whatever a previous click opened before clicking anything else.
var closeButtons = document.querySelectorAll('a[title="Close"], a.closeTheater, a[aria-label="Press Esc to close"], div.fbPhotoSnowlift.fbxPhoto a._xlt');
for (var i = 0; i < closeButtons.length; i++) {
// XXX closeTheater buttons stick around in the dom after closing, clientWidth>0 is one way to check if they're visible
if (closeButtons[i].clientWidth > 0) {
if (umbraState.expectingSomething == 'closeButton') {
console.log("found expected close button, clicking on it " + closeButtons[i].outerHTML);
umbraState.expectingSomething = null;
} else {
console.warn("found UNexpected close button, umbraState.expectingSomething=" + umbraState.expectingSomething + " ... clicking on it " + closeButtons[i].outerHTML);
}
closeButtons[i].click();
return;
}
}
// Step 3: a click was made earlier but its close button is not visible yet;
// do nothing until it shows up.
if (umbraState.expectingSomething == 'closeButton') {
console.log("waiting for close button, haven't seen it yet");
return;
}
// Step 4: click at most one new on-screen target, and note whether unclicked
// targets remain above or below the screen.
var thingsToClick = document.querySelectorAll(UMBRA_THINGS_TO_CLICK_SELECTOR);
var clickedSomething = false;
var somethingLeftBelow = false;
var somethingLeftAbove = false;
// NOTE(review): missedAbove is never read in this function.
var missedAbove = 0;
for (var i = 0; i < thingsToClick.length; i++) {
var target = thingsToClick[i];
if (!(target in umbraAlreadyClicked)) {
var where = umbraAboveBelowOrOnScreen(target);
if (where == 0) { // on screen
// var pos = target.getBoundingClientRect().top;
// window.scrollTo(0, target.getBoundingClientRect().top - 100);
console.log("clicking at " + target.getBoundingClientRect().top + " on " + target.outerHTML);
if (target.click != undefined) {
// the next ticks will look for a close button before clicking again
umbraState.expectingSomething = 'closeButton';
target.click();
}
// green border marks the element as handled
target.style.border = '1px solid #0a0';
umbraAlreadyClicked[target] = true;
clickedSomething = true;
umbraState.idleSince = null;
// only one click per tick
break;
} else if (where > 0) {
somethingLeftBelow = true;
} else if (where < 0) {
somethingLeftAbove = true;
}
}
}
// Remember the furthest point the window has been scrolled to.
if (window.scrollY > umbraState.bottomReachedScrollY) {
umbraState.bottomReachedScrollY = window.scrollY;
}
// Step 5: nothing was clicked on this tick, so decide where to scroll, or
// start the idle clock if there is nowhere left to go.
if (!clickedSomething) {
if (somethingLeftBelow) {
// console.log("scrolling down because everything on this screen has been clicked but there's more below document.body.clientHeight=" + document.body.clientHeight);
window.scrollBy(0, 300);
umbraState.idleSince = null;
} else if (umbraState.bottomReachedScrollY + window.innerHeight < document.documentElement.scrollHeight) {
// console.log("scrolling down because we haven't reached the bottom yet document.body.clientHeight=" + document.body.clientHeight);
window.scrollBy(0, 300);
umbraState.idleSince = null;
} else if (somethingLeftAbove) {
// console.log("scrolling UP because we've already been to the bottom, everything on or below this screen has been clicked, but we missed something above");
window.scrollBy(0, -600);
umbraState.idleSince = null;
} else if (umbraState.idleSince == null) {
umbraState.idleSince = Date.now();
}
}
}
/**
 * Fills in the facebook login form with UMBRA_FB_USER_NAME and
 * UMBRA_FB_PASSWORD and submits it by clicking the login button.
 *
 * If any of the three expected form controls cannot be found, logs a warning
 * and returns without modifying the page, instead of throwing a TypeError on
 * a null element after possibly filling in only part of the form.
 */
var umbraFacebookLogin = function() {
    var emailInput = document.querySelector("form#login_form input#email");
    var passwordInput = document.querySelector("form#login_form input#pass");
    var loginButton = document.querySelector("form#login_form label#loginbutton > input");
    // querySelector returns null when nothing matches
    if (emailInput == null || passwordInput == null || loginButton == null) {
        console.warn("facebook login form is missing expected elements, not logging in");
        return;
    }
    emailInput.value = UMBRA_FB_USER_NAME;
    passwordInput.value = UMBRA_FB_PASSWORD;
    loginButton.click();
};
// Number of seconds without any user-style action (scroll, click, etc) after
// which the behavior is considered done with the page.
var UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC = 10;

// Called from outside of this script.
// Returns true once umbraState.idleSince has been set for longer than
// UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC seconds, false otherwise (including
// while the behavior is still active and idleSince is null).
var umbraBehaviorFinished = function() {
    if (umbraState.idleSince == null) {
        return false;
    }
    var idleSeconds = (Date.now() - umbraState.idleSince) / 1000;
    return idleSeconds > UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC;
};
// Entry point. Log in only when a login form is present and both credentials
// have been filled in; a value that still contains the placeholder text
// "parameter" (at an index > 0) counts as unset. In every other case start
// running the page behavior every 200ms.
if (document.getElementById("login_form") != null && UMBRA_FB_USER_NAME.indexOf("parameter") <= 0 && UMBRA_FB_PASSWORD.indexOf("parameter") <= 0) {
    // login
    umbraFacebookLogin();
} else {
    var umbraIntervalId = setInterval(umbraIntervalFunc, 200);
}

View File

@ -42,43 +42,49 @@ class Behavior:
conf = yaml.load(fin)
Behavior._behaviors = conf['behaviors']
simpleclicks_js_in = os.path.sep.join(__file__.split(os.path.sep)[:-1] + ["behaviors.d"] + ["simpleclicks.js.in"])
with open(simpleclicks_js_in) as fin:
simpleclicks_js_template = string.Template(fin.read())
for behavior in Behavior._behaviors:
if "behavior_js" in behavior:
behavior_js = os.path.sep.join(__file__.split(os.path.sep)[:-1] + ["behaviors.d"] + [behavior["behavior_js"]])
behavior["script"] = open(behavior_js, encoding="utf-8").read()
elif "click_css_selector" in behavior:
if "click_until_hard_timeout" in behavior:
click_until_hard_timeout_value=behavior["click_until_hard_timeout"]
else:
click_until_hard_timeout_value = False
behavior["script"] = simpleclicks_js_template.substitute(click_css_selector=behavior["click_css_selector"], click_until_hard_timeout=click_until_hard_timeout_value)
with open(behavior_js, encoding="utf-8") as fin:
behavior["script"] = fin.read()
elif "behavior_js_template" in behavior:
behavior_js_template = os.path.sep.join(__file__.split(os.path.sep)[:-1] + ["behaviors.d"] + [behavior["behavior_js_template"]])
with open(behavior_js_template, encoding="utf-8") as fin:
behavior["template"] = string.Template(fin.read())
return Behavior._behaviors
def __init__(self, url, umbra_worker):
self.url = url
self.umbra_worker = umbra_worker
self.script_finished = False
self.waiting_result_msg_ids = []
self.active_behavior = None
self.last_activity = time.time()
def start(self):
def start(self, template_parameters=None):
for behavior in Behavior.behaviors():
if re.match(behavior['url_regex'], self.url):
if "behavior_js" in behavior:
self.logger.info("using {} behavior for {}".format(behavior["behavior_js"], self.url))
elif "click_css_selector" in behavior:
self.logger.info("using simple click behavior with css selector {} for {}".format(behavior["click_css_selector"], self.url))
self.logger.info("using %s behavior for %s",
behavior["behavior_js"], self.url)
elif "behavior_js_template" in behavior:
parameters = dict()
if "default_parameters" in behavior:
parameters.update(behavior["default_parameters"])
if template_parameters:
parameters.update(template_parameters)
behavior["script"] = behavior["template"].safe_substitute(parameters)
self.logger.info(
"using template=%s populated with parameters=%s for %s",
repr(behavior["behavior_js_template"]),
parameters, self.url)
self.active_behavior = behavior
self.umbra_worker.send_to_chrome(method="Runtime.evaluate",
suppress_logging=True, params={"expression": behavior["script"]})
self.umbra_worker.send_to_chrome(
method="Runtime.evaluate", suppress_logging=True,
params={"expression": behavior["script"]})
self.notify_of_activity()
return

View File

@ -20,7 +20,10 @@
behaviors:
-
url_regex: '^https?://(?:www\.)?facebook\.com/.*$'
behavior_js: facebook.js
behavior_js_template: facebook.js.template
# default_parameters:
# parameter_username: jdoe@example.com
# parameter_password: abcd1234
request_idle_timeout_sec: 30
-
url_regex: '^https?://(?:www\.)?flickr\.com/.*$'
@ -42,36 +45,50 @@ behaviors:
url_regex: '^https?://(?:www\.)?instagram\.com/.*$'
behavior_js: instagram.js
request_idle_timeout_sec: 10
-
-
url_regex: '^https?://(?:www\.)?brooklynmuseum\.org/exhibitions/.*$'
click_css_selector: img.img-responsive
behavior_js_template: simpleclicks.js.template
default_parameters:
click_css_selector: img.img-responsive
request_idle_timeout_sec: 10
- # acalog https://webarchive.jira.com/browse/ARI-3775
url_regex: '^https?://.*[?&]catoid=[^?]*$'
click_css_selector: a[onclick]
behavior_js_template: simpleclicks.js.template
default_parameters:
click_css_selector: a[onclick]
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/ARI-3956
url_regex: '^https?://(?:www\.)?usask.ca/.*$'
click_css_selector: a[id='feature-next']
behavior_js_template: simpleclicks.js.template
default_parameters:
click_css_selector: a[id='feature-next']
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/AITFIVE-451
url_regex: '^https?://(?:www\.)?soundcloud.com/.*$'
click_css_selector: button.sc-button-play, button.playButton
behavior_js_template: simpleclicks.js.template
default_parameters:
click_css_selector: button.sc-button-play, button.playButton
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/AITFIVE-463
url_regex: '^https?://(?:www\.)?christophercerrone.com/.*$'
click_css_selector: button.playButton.medium
behavior_js_template: simpleclicks.js.template
default_parameters:
click_css_selector: button.playButton.medium
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/ARI-4690
url_regex: '^https?://(?:www\.)?youtube.com/.*$'
click_css_selector: span.load-more-text
behavior_js_template: simpleclicks.js.template
default_parameters:
click_css_selector: span.load-more-text
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/ARI-4725
url_regex: '^https?://(?:www\.)?moma.org/.*$'
click_css_selector: button[data-more-results-bottom-button]
click_until_hard_timeout: True
request_idle_timeout_sec: 10
- # default fallback brhavior
behavior_js_template: simpleclicks.js.template
default_parameters:
click_css_selector: button[data-more-results-bottom-button]
click_until_hard_timeout: True
request_idle_timeout_sec: 10
- # default fallback behavior
url_regex: '^.*$'
request_idle_timeout_sec: 10
behavior_js: default.js

View File

@ -157,8 +157,10 @@ class Browser:
def abort_browse_page(self):
self._abort_browse_page = True
def browse_page(self, url, extra_headers=None, on_request=None,
on_screenshot=None, on_url_change=None):
def browse_page(
self, url, extra_headers=None, behavior_parameters=None,
on_request=None, on_response=None, on_screenshot=None,
on_url_change=None):
"""Synchronously loads a page, takes a screenshot, and runs behaviors.
Raises BrowsingException if browsing the page fails in a non-critical
@ -173,6 +175,8 @@ class Browser:
self.on_request = on_request
self.on_screenshot = on_screenshot
self.on_url_change = on_url_change
self.on_response = on_response
self.behavior_parameters = behavior_parameters
self._waiting_on_screenshot_msg_id = None
self._waiting_on_document_url_msg_id = None
@ -301,10 +305,15 @@ class Browser:
def _network_response_received(self, message):
if (not self._reached_limit
and message["params"]["response"]["status"] == 420
and "Warcprox-Meta" in CaseInsensitiveDict(message["params"]["response"]["headers"])):
warcprox_meta = json.loads(CaseInsensitiveDict(message["params"]["response"]["headers"])["Warcprox-Meta"])
self._reached_limit = brozzler.ReachedLimit(warcprox_meta=warcprox_meta)
and "Warcprox-Meta" in CaseInsensitiveDict(
message["params"]["response"]["headers"])):
warcprox_meta = json.loads(CaseInsensitiveDict(
message["params"]["response"]["headers"])["Warcprox-Meta"])
self._reached_limit = brozzler.ReachedLimit(
warcprox_meta=warcprox_meta)
self.logger.info("reached limit %s", self._reached_limit)
if self.on_response:
self.on_response(message)
def _page_load_event_fired(self, message):
self.logger.info("Page.loadEventFired, requesting screenshot url={} message={}".format(self.url, message))
@ -335,7 +344,7 @@ class Browser:
self._waiting_on_screenshot_msg_id = None
self.logger.info("got screenshot, moving on to starting behaviors url={}".format(self.url))
self._behavior = Behavior(self.url, self)
self._behavior.start()
self._behavior.start(self.behavior_parameters)
elif message["id"] == self._waiting_on_outlinks_msg_id:
self.logger.debug("got outlinks message=%s", message)
self._outlinks = frozenset(message["result"]["result"]["value"].split(" "))
@ -347,10 +356,8 @@ class Browser:
elif self._behavior and self._behavior.is_waiting_on_result(message["id"]):
self._behavior.notify_of_result(message)
def _handle_message(self, websock, message):
# self.logger.debug("message from {} - {}".format(websock.url, message[:95]))
# self.logger.debug("message from {} - {}".format(websock.url, message))
message = json.loads(message)
def _handle_message(self, websock, json_message):
message = json.loads(json_message)
if "method" in message and message["method"] == "Network.requestWillBeSent":
self._network_request_will_be_sent(message)
elif "method" in message and message["method"] == "Network.responseReceived":
@ -368,9 +375,9 @@ class Browser:
# elif "method" in message and message["method"] in ("Network.dataReceived", "Network.responseReceived", "Network.loadingFinished"):
# pass
# elif "method" in message:
# self.logger.debug("{} {}".format(message["method"], message))
# self.logger.debug("{} {}".format(message["method"], json_message))
# else:
# self.logger.debug("[no-method] {}".format(message))
# self.logger.debug("[no-method] {}".format(json_message))
class Chrome:
logger = logging.getLogger(__module__ + "." + __qualname__)
@ -396,15 +403,16 @@ class Chrome:
timeout_sec = 600
new_env = os.environ.copy()
new_env["HOME"] = self.user_home_dir
chrome_args = [self.executable,
"--use-mock-keychain", # mac thing
chrome_args = [
self.executable, "--use-mock-keychain", # mac thing
"--user-data-dir={}".format(self.user_data_dir),
"--remote-debugging-port={}".format(self.port),
"--disable-web-sockets", "--disable-cache",
"--window-size=1100,900", "--no-default-browser-check",
"--disable-first-run-ui", "--no-first-run",
"--homepage=about:blank", "--disable-direct-npapi-requests",
"--disable-web-security"]
"--disable-web-security", "--disable-notifications",
"--disable-save-password-bubble"]
if self.ignore_cert_errors:
chrome_args.append("--ignore-certificate-errors")
if self.proxy:

View File

@ -20,7 +20,7 @@ import setuptools
import glob
setuptools.setup(name='brozzler',
version='1.1.dev7',
version='1.1.dev8',
description='Distributed web crawling with browsers',
url='https://github.com/nlevitt/brozzler',
author='Noah Levitt',