mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 00:29:53 -05:00
Merge branch 'master' into AITFIVE-832
* master: copy over latest behaviors and stuff from umbra support for host rules in outlink scoping recover from rethinkdb error updating service registry
This commit is contained in:
commit
31356d526a
206
brozzler/behaviors.d/facebook.js.template
Normal file
206
brozzler/behaviors.d/facebook.js.template
Normal file
@ -0,0 +1,206 @@
|
|||||||
|
/*
|
||||||
|
* brozzler/behaviors.d/facebook.js - facebook behavior, scrolls to the bottom
|
||||||
|
* of the page, clicks to expand images, a few other things
|
||||||
|
*
|
||||||
|
* Copyright (C) 2014-2016 Internet Archive
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
 * Classifies an element's vertical position relative to the visible viewport.
 *
 * getBoundingClientRect() reports coordinates relative to the viewport, not
 * to the document, so the top edge is compared directly against 0 and
 * window.innerHeight. Adding window.scrollY (as an earlier version did)
 * misclassifies visible elements as "above" once the page has been scrolled.
 *
 * @param {Element} e - element to classify
 * @returns {number} -1 if the element's top edge is above the viewport,
 *     1 if it is below the viewport, 0 if it is inside the viewport
 */
var umbraAboveBelowOrOnScreen = function(e) {
    var eTop = e.getBoundingClientRect().top;
    if (eTop < 0) {
        return -1; // above
    } else if (eTop > window.innerHeight) {
        return 1; // below
    } else {
        return 0; // on screen
    }
};
|
||||||
|
|
||||||
|
// Elements the behavior clicks on. A selector for comments is kept here for
// reference only; it is not part of the active selector:
// 'a.UFIPagerLink > span, a.UFIPagerLink, span.UFIReplySocialSentenceLinkText'
var UMBRA_THINGS_TO_CLICK_SELECTOR = 'a[href^="/browse/likes"], *[rel="theater"]';

// Elements the behavior scrolls internally.
// div[class="phm pluginLikeboxStream"] = facebook widget embedded in 3rd party pages
var UMBRA_THINGS_TO_SCROLL_SELECTOR = 'div[class="phm pluginLikeboxStream"]';

// After this many scroll attempts in which a scrollable element's scrollTop
// did not move, the element is treated as fully scrolled.
var NUMBER_FAILED_SCROLL_ATTEMPTS_ON_THING_TO_SCROLL_BEFORE_STOP_SCROLLING = 5;

// Login credentials; the placeholders are filled in when this template is
// rendered. Values still containing "parameter" are treated as unset.
var UMBRA_FB_USER_NAME = "${parameter_username}";
var UMBRA_FB_PASSWORD = "${parameter_password}";

// Lookup tables recording which elements were already clicked, which were
// already scrolled to the end, and how many scroll attempts failed per element.
var umbraAlreadyClicked = {};
var umbraAlreadyScrolledThing = {};
var umbraScrolledThingFailedScrollAttempts = {};

// Shared state of the behavior:
//   idleSince            - timestamp (ms) since which there was nothing to do,
//                          or null while the behavior is active
//   expectingSomething   - 'closeButton' after a click, otherwise null
//   bottomReachedScrollY - largest window.scrollY observed so far
var umbraState = {
    idleSince: null,
    expectingSomething: null,
    bottomReachedScrollY: 0
};
|
||||||
|
|
||||||
|
// Every element that has been assigned a key so far; an element's index in
// this array determines its key.
var umbraKeyedElements = [];

/**
 * Returns a string key that is unique to the given element.
 *
 * Elements cannot be used directly as keys of a plain object: they are
 * coerced to strings such as "[object HTMLAnchorElement]", so every element
 * of the same type would share a single key.
 *
 * @param {Element} element - element to identify
 * @returns {string} key that is stable for this element across calls
 */
var umbraElementKey = function(element) {
    var index = umbraKeyedElements.indexOf(element);
    if (index < 0) {
        index = umbraKeyedElements.push(element) - 1;
    }
    return "umbra-element-" + index;
};

/**
 * One step of the behavior. Each call does at most one of the following, in
 * this order of priority:
 *
 *   1. scrolls every not-yet-finished scrollable element to its end;
 *   2. clicks the first visible close button;
 *   3. waits, if a close button is expected but not visible yet;
 *   4. clicks the first not-yet-clicked clickable element that is on screen;
 *   5. scrolls the window towards clickable elements that are left, or
 *      towards the bottom of the page if it has not been reached yet;
 *   6. records in umbraState.idleSince that there was nothing left to do.
 *
 * Reads and updates umbraState, umbraAlreadyClicked,
 * umbraAlreadyScrolledThing and umbraScrolledThingFailedScrollAttempts.
 */
var umbraIntervalFunc = function() {

    var thingsToScroll = document.querySelectorAll(UMBRA_THINGS_TO_SCROLL_SELECTOR);
    var everythingScrolled = true;

    for (var i = 0; i < thingsToScroll.length; i++) {
        var target = thingsToScroll[i];
        var targetKey = umbraElementKey(target);

        if (!(targetKey in umbraAlreadyScrolledThing)) {

            everythingScrolled = false;

            console.log("scrolling to " + target.scrollHeight + " on element with nodeName " + target.nodeName + " with id of " + target.id);
            var lastScrollTop = target.scrollTop;
            target.scrollTop = target.scrollHeight;

            umbraState.idleSince = null;

            if (target.scrollTop >= target.scrollHeight) {
                umbraAlreadyScrolledThing[targetKey] = true;
            }
            else if (target.scrollTop == lastScrollTop) {
                // the scroll position did not move; count the failed attempt
                umbraScrolledThingFailedScrollAttempts[targetKey] =
                        (umbraScrolledThingFailedScrollAttempts[targetKey] || 0) + 1;

                if (umbraScrolledThingFailedScrollAttempts[targetKey] >= NUMBER_FAILED_SCROLL_ATTEMPTS_ON_THING_TO_SCROLL_BEFORE_STOP_SCROLLING) {
                    umbraAlreadyScrolledThing[targetKey] = true;
                }
            }
            else {
                //reset failed count on a successful scroll
                umbraScrolledThingFailedScrollAttempts[targetKey] = 0;
            }
        }
        else {
            console.log("done scrolling for element with nodeName " + target.nodeName + " with id of " + target.id);
        }

        umbraState.expectingSomething = null;
    }

    // scrollable elements exist and all of them are finished: nothing else
    // is attempted on this page, just start (or keep) the idle clock
    if (thingsToScroll && thingsToScroll.length > 0 && everythingScrolled) {
        if (umbraState.idleSince == null) {
            umbraState.idleSince = Date.now();
        }

        return;
    }

    var closeButtons = document.querySelectorAll('a[title="Close"], a.closeTheater, a[aria-label="Press Esc to close"], div.fbPhotoSnowlift.fbxPhoto a._xlt');
    for (var j = 0; j < closeButtons.length; j++) {
        // XXX closeTheater buttons stick around in the dom after closing, clientWidth>0 is one way to check if they're visible
        if (closeButtons[j].clientWidth > 0) {
            if (umbraState.expectingSomething == 'closeButton') {
                console.log("found expected close button, clicking on it " + closeButtons[j].outerHTML);
                umbraState.expectingSomething = null;
            } else {
                console.warn("found UNexpected close button, umbraState.expectingSomething=" + umbraState.expectingSomething + " ... clicking on it " + closeButtons[j].outerHTML);
            }
            closeButtons[j].click();
            return;
        }
    }
    if (umbraState.expectingSomething == 'closeButton') {
        console.log("waiting for close button, haven't seen it yet");
        return;
    }

    var thingsToClick = document.querySelectorAll(UMBRA_THINGS_TO_CLICK_SELECTOR);
    var clickedSomething = false;
    var somethingLeftBelow = false;
    var somethingLeftAbove = false;

    for (var k = 0; k < thingsToClick.length; k++) {
        var clickTarget = thingsToClick[k];
        var clickKey = umbraElementKey(clickTarget);
        if (!(clickKey in umbraAlreadyClicked)) {
            var where = umbraAboveBelowOrOnScreen(clickTarget);
            if (where == 0) { // on screen
                console.log("clicking at " + clickTarget.getBoundingClientRect().top + " on " + clickTarget.outerHTML);
                if (clickTarget.click != undefined) {
                    umbraState.expectingSomething = 'closeButton';
                    clickTarget.click();
                }
                // visual marker on elements that have been handled
                clickTarget.style.border = '1px solid #0a0';
                umbraAlreadyClicked[clickKey] = true;
                clickedSomething = true;
                umbraState.idleSince = null;
                // only one click per step
                break;
            } else if (where > 0) {
                somethingLeftBelow = true;
            } else if (where < 0) {
                somethingLeftAbove = true;
            }
        }
    }

    if (window.scrollY > umbraState.bottomReachedScrollY) {
        umbraState.bottomReachedScrollY = window.scrollY;
    }

    if (!clickedSomething) {
        if (somethingLeftBelow) {
            // everything on this screen has been clicked but there's more below
            window.scrollBy(0, 300);
            umbraState.idleSince = null;
        } else if (umbraState.bottomReachedScrollY + window.innerHeight < document.documentElement.scrollHeight) {
            // we haven't reached the bottom yet
            window.scrollBy(0, 300);
            umbraState.idleSince = null;
        } else if (somethingLeftAbove) {
            // we've already been to the bottom, everything on or below this
            // screen has been clicked, but we missed something above
            window.scrollBy(0, -600);
            umbraState.idleSince = null;
        } else if (umbraState.idleSince == null) {
            umbraState.idleSince = Date.now();
        }
    }
};
|
||||||
|
|
||||||
|
/**
 * Fills the facebook login form with the configured user name and password
 * and clicks its submit button.
 *
 * The three form controls are looked up inside form#login_form; no check is
 * made that they were found.
 */
var umbraFacebookLogin = function() {
    var controls = {
        user: document.querySelector("form#login_form input#email"),
        secret: document.querySelector("form#login_form input#pass"),
        submit: document.querySelector("form#login_form label#loginbutton > input")
    };

    controls.user.value = UMBRA_FB_USER_NAME;
    controls.secret.value = UMBRA_FB_PASSWORD;
    controls.submit.click();
};
|
||||||
|
|
||||||
|
// Number of seconds without any action (scroll, click, etc) after which the
// behavior is considered finished with the page.
var UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC = 10;

/**
 * Reports whether the behavior is done with the page. Called from outside of
 * this script.
 *
 * @returns {boolean} true if umbraState.idleSince is set and lies more than
 *     UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC seconds in the past, false otherwise
 */
var umbraBehaviorFinished = function() {
    // not idle at all: there is still work going on
    if (umbraState.idleSince == null) {
        return false;
    }

    var idleSeconds = (Date.now() - umbraState.idleSince) / 1000;
    return idleSeconds > UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC;
};
|
||||||
|
|
||||||
|
// Entry point of the script. Log in only when the page contains a login form
// and both credentials were filled in; a credential whose text contains
// "parameter" after its first character is treated as an unset template
// parameter. In every other case run the behavior step every 200 ms.
if (document.getElementById("login_form") != null
        && !(UMBRA_FB_USER_NAME.indexOf("parameter") > 0)
        && !(UMBRA_FB_PASSWORD.indexOf("parameter") > 0)) {
    umbraFacebookLogin();
} else {
    var umbraIntervalId = setInterval(umbraIntervalFunc, 200);
}
|
||||||
|
|
||||||
|
|
@ -42,43 +42,49 @@ class Behavior:
|
|||||||
conf = yaml.load(fin)
|
conf = yaml.load(fin)
|
||||||
Behavior._behaviors = conf['behaviors']
|
Behavior._behaviors = conf['behaviors']
|
||||||
|
|
||||||
simpleclicks_js_in = os.path.sep.join(__file__.split(os.path.sep)[:-1] + ["behaviors.d"] + ["simpleclicks.js.in"])
|
|
||||||
with open(simpleclicks_js_in) as fin:
|
|
||||||
simpleclicks_js_template = string.Template(fin.read())
|
|
||||||
|
|
||||||
for behavior in Behavior._behaviors:
|
for behavior in Behavior._behaviors:
|
||||||
if "behavior_js" in behavior:
|
if "behavior_js" in behavior:
|
||||||
behavior_js = os.path.sep.join(__file__.split(os.path.sep)[:-1] + ["behaviors.d"] + [behavior["behavior_js"]])
|
behavior_js = os.path.sep.join(__file__.split(os.path.sep)[:-1] + ["behaviors.d"] + [behavior["behavior_js"]])
|
||||||
behavior["script"] = open(behavior_js, encoding="utf-8").read()
|
with open(behavior_js, encoding="utf-8") as fin:
|
||||||
elif "click_css_selector" in behavior:
|
behavior["script"] = fin.read()
|
||||||
if "click_until_hard_timeout" in behavior:
|
elif "behavior_js_template" in behavior:
|
||||||
click_until_hard_timeout_value=behavior["click_until_hard_timeout"]
|
behavior_js_template = os.path.sep.join(__file__.split(os.path.sep)[:-1] + ["behaviors.d"] + [behavior["behavior_js_template"]])
|
||||||
else:
|
with open(behavior_js_template, encoding="utf-8") as fin:
|
||||||
click_until_hard_timeout_value = False
|
behavior["template"] = string.Template(fin.read())
|
||||||
behavior["script"] = simpleclicks_js_template.substitute(click_css_selector=behavior["click_css_selector"], click_until_hard_timeout=click_until_hard_timeout_value)
|
|
||||||
|
|
||||||
return Behavior._behaviors
|
return Behavior._behaviors
|
||||||
|
|
||||||
def __init__(self, url, umbra_worker):
|
def __init__(self, url, umbra_worker):
|
||||||
self.url = url
|
self.url = url
|
||||||
self.umbra_worker = umbra_worker
|
self.umbra_worker = umbra_worker
|
||||||
|
|
||||||
self.script_finished = False
|
self.script_finished = False
|
||||||
self.waiting_result_msg_ids = []
|
self.waiting_result_msg_ids = []
|
||||||
self.active_behavior = None
|
self.active_behavior = None
|
||||||
self.last_activity = time.time()
|
self.last_activity = time.time()
|
||||||
|
|
||||||
def start(self):
|
def start(self, template_parameters=None):
|
||||||
for behavior in Behavior.behaviors():
|
for behavior in Behavior.behaviors():
|
||||||
if re.match(behavior['url_regex'], self.url):
|
if re.match(behavior['url_regex'], self.url):
|
||||||
if "behavior_js" in behavior:
|
if "behavior_js" in behavior:
|
||||||
self.logger.info("using {} behavior for {}".format(behavior["behavior_js"], self.url))
|
self.logger.info("using %s behavior for %s",
|
||||||
elif "click_css_selector" in behavior:
|
behavior["behavior_js"], self.url)
|
||||||
self.logger.info("using simple click behavior with css selector {} for {}".format(behavior["click_css_selector"], self.url))
|
elif "behavior_js_template" in behavior:
|
||||||
|
parameters = dict()
|
||||||
|
if "default_parameters" in behavior:
|
||||||
|
parameters.update(behavior["default_parameters"])
|
||||||
|
if template_parameters:
|
||||||
|
parameters.update(template_parameters)
|
||||||
|
behavior["script"] = behavior["template"].safe_substitute(parameters)
|
||||||
|
|
||||||
|
self.logger.info(
|
||||||
|
"using template=%s populated with parameters=%s for %s",
|
||||||
|
repr(behavior["behavior_js_template"]),
|
||||||
|
parameters, self.url)
|
||||||
|
|
||||||
self.active_behavior = behavior
|
self.active_behavior = behavior
|
||||||
self.umbra_worker.send_to_chrome(method="Runtime.evaluate",
|
self.umbra_worker.send_to_chrome(
|
||||||
suppress_logging=True, params={"expression": behavior["script"]})
|
method="Runtime.evaluate", suppress_logging=True,
|
||||||
|
params={"expression": behavior["script"]})
|
||||||
self.notify_of_activity()
|
self.notify_of_activity()
|
||||||
return
|
return
|
||||||
|
|
||||||
|
@ -20,7 +20,10 @@
|
|||||||
behaviors:
|
behaviors:
|
||||||
-
|
-
|
||||||
url_regex: '^https?://(?:www\.)?facebook\.com/.*$'
|
url_regex: '^https?://(?:www\.)?facebook\.com/.*$'
|
||||||
behavior_js: facebook.js
|
behavior_js_template: facebook.js.template
|
||||||
|
# default_parameters:
|
||||||
|
# parameter_username: jdoe@example.com
|
||||||
|
# parameter_password: abcd1234
|
||||||
request_idle_timeout_sec: 30
|
request_idle_timeout_sec: 30
|
||||||
-
|
-
|
||||||
url_regex: '^https?://(?:www\.)?flickr\.com/.*$'
|
url_regex: '^https?://(?:www\.)?flickr\.com/.*$'
|
||||||
@ -42,36 +45,50 @@ behaviors:
|
|||||||
url_regex: '^https?://(?:www\.)?instagram\.com/.*$'
|
url_regex: '^https?://(?:www\.)?instagram\.com/.*$'
|
||||||
behavior_js: instagram.js
|
behavior_js: instagram.js
|
||||||
request_idle_timeout_sec: 10
|
request_idle_timeout_sec: 10
|
||||||
-
|
-
|
||||||
url_regex: '^https?://(?:www\.)?brooklynmuseum\.org/exhibitions/.*$'
|
url_regex: '^https?://(?:www\.)?brooklynmuseum\.org/exhibitions/.*$'
|
||||||
click_css_selector: img.img-responsive
|
behavior_js_template: simpleclicks.js.template
|
||||||
|
default_parameters:
|
||||||
|
click_css_selector: img.img-responsive
|
||||||
request_idle_timeout_sec: 10
|
request_idle_timeout_sec: 10
|
||||||
- # acalog https://webarchive.jira.com/browse/ARI-3775
|
- # acalog https://webarchive.jira.com/browse/ARI-3775
|
||||||
url_regex: '^https?://.*[?&]catoid=[^?]*$'
|
url_regex: '^https?://.*[?&]catoid=[^?]*$'
|
||||||
click_css_selector: a[onclick]
|
behavior_js_template: simpleclicks.js.template
|
||||||
|
default_parameters:
|
||||||
|
click_css_selector: a[onclick]
|
||||||
request_idle_timeout_sec: 10
|
request_idle_timeout_sec: 10
|
||||||
- # https://webarchive.jira.com/browse/ARI-3956
|
- # https://webarchive.jira.com/browse/ARI-3956
|
||||||
url_regex: '^https?://(?:www\.)?usask.ca/.*$'
|
url_regex: '^https?://(?:www\.)?usask.ca/.*$'
|
||||||
click_css_selector: a[id='feature-next']
|
behavior_js_template: simpleclicks.js.template
|
||||||
|
default_parameters:
|
||||||
|
click_css_selector: a[id='feature-next']
|
||||||
request_idle_timeout_sec: 10
|
request_idle_timeout_sec: 10
|
||||||
- # https://webarchive.jira.com/browse/AITFIVE-451
|
- # https://webarchive.jira.com/browse/AITFIVE-451
|
||||||
url_regex: '^https?://(?:www\.)?soundcloud.com/.*$'
|
url_regex: '^https?://(?:www\.)?soundcloud.com/.*$'
|
||||||
click_css_selector: button.sc-button-play, button.playButton
|
behavior_js_template: simpleclicks.js.template
|
||||||
|
default_parameters:
|
||||||
|
click_css_selector: button.sc-button-play, button.playButton
|
||||||
request_idle_timeout_sec: 10
|
request_idle_timeout_sec: 10
|
||||||
- # https://webarchive.jira.com/browse/AITFIVE-463
|
- # https://webarchive.jira.com/browse/AITFIVE-463
|
||||||
url_regex: '^https?://(?:www\.)?christophercerrone.com/.*$'
|
url_regex: '^https?://(?:www\.)?christophercerrone.com/.*$'
|
||||||
click_css_selector: button.playButton.medium
|
behavior_js_template: simpleclicks.js.template
|
||||||
|
default_parameters:
|
||||||
|
click_css_selector: button.playButton.medium
|
||||||
request_idle_timeout_sec: 10
|
request_idle_timeout_sec: 10
|
||||||
- # https://webarchive.jira.com/browse/ARI-4690
|
- # https://webarchive.jira.com/browse/ARI-4690
|
||||||
url_regex: '^https?://(?:www\.)?youtube.com/.*$'
|
url_regex: '^https?://(?:www\.)?youtube.com/.*$'
|
||||||
click_css_selector: span.load-more-text
|
behavior_js_template: simpleclicks.js.template
|
||||||
|
default_parameters:
|
||||||
|
click_css_selector: span.load-more-text
|
||||||
request_idle_timeout_sec: 10
|
request_idle_timeout_sec: 10
|
||||||
- # https://webarchive.jira.com/browse/ARI-4725
|
- # https://webarchive.jira.com/browse/ARI-4725
|
||||||
url_regex: '^https?://(?:www\.)?moma.org/.*$'
|
url_regex: '^https?://(?:www\.)?moma.org/.*$'
|
||||||
click_css_selector: button[data-more-results-bottom-button]
|
behavior_js_template: simpleclicks.js.template
|
||||||
click_until_hard_timeout: True
|
default_parameters:
|
||||||
request_idle_timeout_sec: 10
|
click_css_selector: button[data-more-results-bottom-button]
|
||||||
- # default fallback brhavior
|
click_until_hard_timeout: True
|
||||||
|
request_idle_timeout_sec: 10
|
||||||
|
- # default fallback behavior
|
||||||
url_regex: '^.*$'
|
url_regex: '^.*$'
|
||||||
request_idle_timeout_sec: 10
|
request_idle_timeout_sec: 10
|
||||||
behavior_js: default.js
|
behavior_js: default.js
|
||||||
|
@ -157,8 +157,10 @@ class Browser:
|
|||||||
def abort_browse_page(self):
|
def abort_browse_page(self):
|
||||||
self._abort_browse_page = True
|
self._abort_browse_page = True
|
||||||
|
|
||||||
def browse_page(self, url, extra_headers=None, on_request=None,
|
def browse_page(
|
||||||
on_screenshot=None, on_url_change=None):
|
self, url, extra_headers=None, behavior_parameters=None,
|
||||||
|
on_request=None, on_response=None, on_screenshot=None,
|
||||||
|
on_url_change=None):
|
||||||
"""Synchronously loads a page, takes a screenshot, and runs behaviors.
|
"""Synchronously loads a page, takes a screenshot, and runs behaviors.
|
||||||
|
|
||||||
Raises BrowsingException if browsing the page fails in a non-critical
|
Raises BrowsingException if browsing the page fails in a non-critical
|
||||||
@ -173,6 +175,8 @@ class Browser:
|
|||||||
self.on_request = on_request
|
self.on_request = on_request
|
||||||
self.on_screenshot = on_screenshot
|
self.on_screenshot = on_screenshot
|
||||||
self.on_url_change = on_url_change
|
self.on_url_change = on_url_change
|
||||||
|
self.on_response = on_response
|
||||||
|
self.behavior_parameters = behavior_parameters
|
||||||
|
|
||||||
self._waiting_on_scroll_to_top_msg_id = None
|
self._waiting_on_scroll_to_top_msg_id = None
|
||||||
self._waiting_on_screenshot_msg_id = None
|
self._waiting_on_screenshot_msg_id = None
|
||||||
@ -313,10 +317,15 @@ class Browser:
|
|||||||
def _network_response_received(self, message):
|
def _network_response_received(self, message):
|
||||||
if (not self._reached_limit
|
if (not self._reached_limit
|
||||||
and message["params"]["response"]["status"] == 420
|
and message["params"]["response"]["status"] == 420
|
||||||
and "Warcprox-Meta" in CaseInsensitiveDict(message["params"]["response"]["headers"])):
|
and "Warcprox-Meta" in CaseInsensitiveDict(
|
||||||
warcprox_meta = json.loads(CaseInsensitiveDict(message["params"]["response"]["headers"])["Warcprox-Meta"])
|
message["params"]["response"]["headers"])):
|
||||||
self._reached_limit = brozzler.ReachedLimit(warcprox_meta=warcprox_meta)
|
warcprox_meta = json.loads(CaseInsensitiveDict(
|
||||||
|
message["params"]["response"]["headers"])["Warcprox-Meta"])
|
||||||
|
self._reached_limit = brozzler.ReachedLimit(
|
||||||
|
warcprox_meta=warcprox_meta)
|
||||||
self.logger.info("reached limit %s", self._reached_limit)
|
self.logger.info("reached limit %s", self._reached_limit)
|
||||||
|
if self.on_response:
|
||||||
|
self.on_response(message)
|
||||||
|
|
||||||
def _page_load_event_fired(self, message):
|
def _page_load_event_fired(self, message):
|
||||||
self.logger.info("Page.loadEventFired, moving on to starting behaviors url={}".format(self.url))
|
self.logger.info("Page.loadEventFired, moving on to starting behaviors url={}".format(self.url))
|
||||||
@ -363,10 +372,8 @@ class Browser:
|
|||||||
elif self._behavior and self._behavior.is_waiting_on_result(message["id"]):
|
elif self._behavior and self._behavior.is_waiting_on_result(message["id"]):
|
||||||
self._behavior.notify_of_result(message)
|
self._behavior.notify_of_result(message)
|
||||||
|
|
||||||
def _handle_message(self, websock, message):
|
def _handle_message(self, websock, json_message):
|
||||||
# self.logger.debug("message from {} - {}".format(websock.url, message[:95]))
|
message = json.loads(json_message)
|
||||||
# self.logger.debug("message from {} - {}".format(websock.url, message))
|
|
||||||
message = json.loads(message)
|
|
||||||
if "method" in message and message["method"] == "Network.requestWillBeSent":
|
if "method" in message and message["method"] == "Network.requestWillBeSent":
|
||||||
self._network_request_will_be_sent(message)
|
self._network_request_will_be_sent(message)
|
||||||
elif "method" in message and message["method"] == "Network.responseReceived":
|
elif "method" in message and message["method"] == "Network.responseReceived":
|
||||||
@ -384,9 +391,9 @@ class Browser:
|
|||||||
# elif "method" in message and message["method"] in ("Network.dataReceived", "Network.responseReceived", "Network.loadingFinished"):
|
# elif "method" in message and message["method"] in ("Network.dataReceived", "Network.responseReceived", "Network.loadingFinished"):
|
||||||
# pass
|
# pass
|
||||||
# elif "method" in message:
|
# elif "method" in message:
|
||||||
# self.logger.debug("{} {}".format(message["method"], message))
|
# self.logger.debug("{} {}".format(message["method"], json_message))
|
||||||
# else:
|
# else:
|
||||||
# self.logger.debug("[no-method] {}".format(message))
|
# self.logger.debug("[no-method] {}".format(json_message))
|
||||||
|
|
||||||
class Chrome:
|
class Chrome:
|
||||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||||
@ -412,15 +419,16 @@ class Chrome:
|
|||||||
timeout_sec = 600
|
timeout_sec = 600
|
||||||
new_env = os.environ.copy()
|
new_env = os.environ.copy()
|
||||||
new_env["HOME"] = self.user_home_dir
|
new_env["HOME"] = self.user_home_dir
|
||||||
chrome_args = [self.executable,
|
chrome_args = [
|
||||||
"--use-mock-keychain", # mac thing
|
self.executable, "--use-mock-keychain", # mac thing
|
||||||
"--user-data-dir={}".format(self.user_data_dir),
|
"--user-data-dir={}".format(self.user_data_dir),
|
||||||
"--remote-debugging-port={}".format(self.port),
|
"--remote-debugging-port={}".format(self.port),
|
||||||
"--disable-web-sockets", "--disable-cache",
|
"--disable-web-sockets", "--disable-cache",
|
||||||
"--window-size=1100,900", "--no-default-browser-check",
|
"--window-size=1100,900", "--no-default-browser-check",
|
||||||
"--disable-first-run-ui", "--no-first-run",
|
"--disable-first-run-ui", "--no-first-run",
|
||||||
"--homepage=about:blank", "--disable-direct-npapi-requests",
|
"--homepage=about:blank", "--disable-direct-npapi-requests",
|
||||||
"--disable-web-security"]
|
"--disable-web-security", "--disable-notifications",
|
||||||
|
"--disable-save-password-bubble"]
|
||||||
if self.ignore_cert_errors:
|
if self.ignore_cert_errors:
|
||||||
chrome_args.append("--ignore-certificate-errors")
|
chrome_args.append("--ignore-certificate-errors")
|
||||||
if self.proxy:
|
if self.proxy:
|
||||||
|
@ -258,10 +258,10 @@ class RethinkDbFrontier:
|
|||||||
def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
|
def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
|
||||||
counts = {"added":0,"updated":0,"rejected":0,"blocked":0}
|
counts = {"added":0,"updated":0,"rejected":0,"blocked":0}
|
||||||
for url in outlinks or []:
|
for url in outlinks or []:
|
||||||
surt_ = brozzler.site.to_surt(url)
|
u = brozzler.site.Url(url)
|
||||||
if site.is_in_scope(url, surt_=surt_, parent_page=parent_page):
|
if site.is_in_scope(u, parent_page=parent_page):
|
||||||
if brozzler.is_permitted_by_robots(site, url):
|
if brozzler.is_permitted_by_robots(site, url):
|
||||||
if not surt_.startswith(site.scope["surt"]):
|
if not u.surt.startswith(site.scope["surt"]):
|
||||||
hops_off_surt = parent_page.hops_off_surt + 1
|
hops_off_surt = parent_page.hops_off_surt + 1
|
||||||
else:
|
else:
|
||||||
hops_off_surt = 0
|
hops_off_surt = 0
|
||||||
|
155
brozzler/site.py
155
brozzler/site.py
@ -25,8 +25,62 @@ import time
|
|||||||
import rethinkstuff
|
import rethinkstuff
|
||||||
import datetime
|
import datetime
|
||||||
import re
|
import re
|
||||||
|
import ipaddress
|
||||||
|
|
||||||
_EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(tzinfo=rethinkstuff.UTC)
|
_EPOCH_UTC = datetime.datetime.utcfromtimestamp(0.0).replace(
|
||||||
|
tzinfo=rethinkstuff.UTC)
|
||||||
|
|
||||||
|
class Url:
|
||||||
|
def __init__(self, url):
|
||||||
|
self.url = url
|
||||||
|
self._surt = None
|
||||||
|
self._host = None
|
||||||
|
|
||||||
|
@property
|
||||||
|
def surt(self):
|
||||||
|
if not self._surt:
|
||||||
|
hurl = surt.handyurl.parse(self.url)
|
||||||
|
surt.GoogleURLCanonicalizer.canonicalize(hurl)
|
||||||
|
hurl.query = None
|
||||||
|
hurl.hash = None
|
||||||
|
# XXX chop off path after last slash??
|
||||||
|
self._surt = hurl.getURLString(surt=True, trailing_comma=True)
|
||||||
|
return self._surt
|
||||||
|
|
||||||
|
@property
|
||||||
|
def host(self):
|
||||||
|
if not self._host:
|
||||||
|
self._host = surt.handyurl.parse(self.url).host
|
||||||
|
return self._host
|
||||||
|
|
||||||
|
def matches_ip_or_domain(self, ip_or_domain):
|
||||||
|
"""Returns true if
|
||||||
|
- ip_or_domain is an ip address and self.host is the same ip address
|
||||||
|
- ip_or_domain is a domain and self.host is the same domain
|
||||||
|
- ip_or_domain is a domain and self.host is a subdomain of it
|
||||||
|
"""
|
||||||
|
if ip_or_domain == self.host:
|
||||||
|
return True
|
||||||
|
|
||||||
|
# if either ip_or_domain or self.host are ip addresses, and they're not
|
||||||
|
# identical (previous check), not a match
|
||||||
|
try:
|
||||||
|
ipaddress.ip_address(ip_or_domain)
|
||||||
|
return False
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
try:
|
||||||
|
ipaddress.ip_address(self.host)
|
||||||
|
return False
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# if we get here, we're looking at two hostnames
|
||||||
|
# XXX do we need to handle case of one punycoded idn, other not?
|
||||||
|
domain_parts = ip_or_domain.split(".")
|
||||||
|
host_parts = self.host.split(".")
|
||||||
|
|
||||||
|
return host_parts[-len(domain_parts):] == domain_parts
|
||||||
|
|
||||||
class Site(brozzler.BaseDictable):
|
class Site(brozzler.BaseDictable):
|
||||||
logger = logging.getLogger(__module__ + "." + __qualname__)
|
logger = logging.getLogger(__module__ + "." + __qualname__)
|
||||||
@ -58,7 +112,7 @@ class Site(brozzler.BaseDictable):
|
|||||||
|
|
||||||
self.scope = scope or {}
|
self.scope = scope or {}
|
||||||
if not "surt" in self.scope:
|
if not "surt" in self.scope:
|
||||||
self.scope["surt"] = self._to_surt(seed)
|
self.scope["surt"] = Url(seed).surt
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return """Site(id={},seed={},scope={},proxy={},enable_warcprox_features={},ignore_robots={},extra_headers={},reached_limit={})""".format(
|
return """Site(id={},seed={},scope={},proxy={},enable_warcprox_features={},ignore_robots={},extra_headers={},reached_limit={})""".format(
|
||||||
@ -69,72 +123,95 @@ class Site(brozzler.BaseDictable):
|
|||||||
def __str__(self):
|
def __str__(self):
|
||||||
return "Site-%s-%s" % (self.id, self.seed)
|
return "Site-%s-%s" % (self.id, self.seed)
|
||||||
|
|
||||||
def _to_surt(self, url):
|
|
||||||
hurl = surt.handyurl.parse(url)
|
|
||||||
surt.GoogleURLCanonicalizer.canonicalize(hurl)
|
|
||||||
hurl.query = None
|
|
||||||
hurl.hash = None
|
|
||||||
# XXX chop off path after last slash??
|
|
||||||
return hurl.getURLString(surt=True, trailing_comma=True)
|
|
||||||
|
|
||||||
def note_seed_redirect(self, url):
|
def note_seed_redirect(self, url):
|
||||||
new_scope_surt = self._to_surt(url)
|
new_scope_surt = Url(url).surt
|
||||||
if not new_scope_surt.startswith(self.scope["surt"]):
|
if not new_scope_surt.startswith(self.scope["surt"]):
|
||||||
self.logger.info("changing site scope surt from {} to {}".format(
|
self.logger.info("changing site scope surt from {} to {}".format(
|
||||||
self.scope["surt"], new_scope_surt))
|
self.scope["surt"], new_scope_surt))
|
||||||
self.scope["surt"] = new_scope_surt
|
self.scope["surt"] = new_scope_surt
|
||||||
|
|
||||||
def is_in_scope(self, url, surt_=None, parent_page=None):
|
def is_in_scope(self, url, parent_page=None):
|
||||||
if not surt_:
|
if not isinstance(url, Url):
|
||||||
surt_ = to_surt(url)
|
u = Url(url)
|
||||||
might_accept = False
|
else:
|
||||||
|
u = url
|
||||||
|
|
||||||
if not surt_.startswith("http://") and not surt_.startswith("https://"):
|
might_accept = False
|
||||||
|
if not u.surt.startswith("http://") and not u.surt.startswith("https://"):
|
||||||
# XXX doesn't belong here maybe (where? worker ignores unknown
|
# XXX doesn't belong here maybe (where? worker ignores unknown
|
||||||
# schemes?)
|
# schemes?)
|
||||||
return False
|
return False
|
||||||
elif (parent_page and "max_hops" in self.scope
|
elif (parent_page and "max_hops" in self.scope
|
||||||
and parent_page.hops_from_seed >= self.scope["max_hops"]):
|
and parent_page.hops_from_seed >= self.scope["max_hops"]):
|
||||||
pass
|
pass
|
||||||
elif surt_.startswith(self.scope["surt"]):
|
elif u.surt.startswith(self.scope["surt"]):
|
||||||
might_accept = True
|
might_accept = True
|
||||||
elif parent_page and parent_page.hops_off_surt < self.scope.get(
|
elif parent_page and parent_page.hops_off_surt < self.scope.get(
|
||||||
"max_hops_off_surt", 0):
|
"max_hops_off_surt", 0):
|
||||||
might_accept = True
|
might_accept = True
|
||||||
elif "accepts" in self.scope:
|
elif "accepts" in self.scope:
|
||||||
for rule in self.scope["accepts"]:
|
for rule in self.scope["accepts"]:
|
||||||
if self._scope_rule_applies(rule, url, surt_):
|
if self._scope_rule_applies(rule, u):
|
||||||
might_accept = True
|
might_accept = True
|
||||||
|
break
|
||||||
|
|
||||||
if might_accept:
|
if might_accept:
|
||||||
if "blocks" in self.scope:
|
if "blocks" in self.scope:
|
||||||
for rule in self.scope["blocks"]:
|
for rule in self.scope["blocks"]:
|
||||||
if self._scope_rule_applies(rule, url, surt_):
|
if self._scope_rule_applies(rule, u):
|
||||||
return False
|
return False
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def _scope_rule_applies(self, rule, url, surt_):
|
def _scope_rule_applies(self, rule, url):
|
||||||
if not "url_match" in rule or not "value" in rule:
|
"""
|
||||||
self.logger.warn("unable to make sense of scope rule %s", rule)
|
Examples of valid rules:
|
||||||
return False
|
[
|
||||||
if rule["url_match"] == "STRING_MATCH":
|
{
|
||||||
return url.find(rule["value"]) >= 0
|
"host": "monkey.org",
|
||||||
elif rule["url_match"] == "REGEX_MATCH":
|
"url_match": "STRING_MATCH",
|
||||||
try:
|
"value": "bar",
|
||||||
return re.fullmatch(rule["value"], url)
|
},
|
||||||
except Exception as e:
|
{
|
||||||
self.logger.warn(
|
"url_match": "SURT_MATCH",
|
||||||
"caught exception matching against regex %s: %s",
|
"value": "+http://(com,woop,)/fuh/",
|
||||||
rule["value"], e)
|
},
|
||||||
return False
|
{
|
||||||
elif rule["url_match"] == "SURT_MATCH":
|
"host": "badhost.com",
|
||||||
return surt_.startswith(rule["value"])
|
},
|
||||||
|
]
|
||||||
|
"""
|
||||||
|
if not isinstance(url, Url):
|
||||||
|
u = Url(url)
|
||||||
else:
|
else:
|
||||||
self.logger.warn("invalid rule.url_match=%s", rule.url_match)
|
u = url
|
||||||
return False
|
|
||||||
|
|
||||||
|
if "host" in rule and not u.matches_ip_or_domain(rule["host"]):
|
||||||
|
return False
|
||||||
|
if "url_match" in rule:
|
||||||
|
if rule["url_match"] == "STRING_MATCH":
|
||||||
|
return u.url.find(rule["value"]) >= 0
|
||||||
|
elif rule["url_match"] == "REGEX_MATCH":
|
||||||
|
try:
|
||||||
|
return re.fullmatch(rule["value"], u.url)
|
||||||
|
except Exception as e:
|
||||||
|
self.logger.warn(
|
||||||
|
"caught exception matching against regex %s: %s",
|
||||||
|
rule["value"], e)
|
||||||
|
return False
|
||||||
|
elif rule["url_match"] == "SURT_MATCH":
|
||||||
|
return u.surt.startswith(rule["value"])
|
||||||
|
else:
|
||||||
|
self.logger.warn("invalid rule.url_match=%s", rule.url_match)
|
||||||
|
return False
|
||||||
|
else:
|
||||||
|
if "host" in rule:
|
||||||
|
# we already know that it matches from earlier check
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
self.logger.warn("unable to make sense of scope rule %s", rule)
|
||||||
|
return False
|
||||||
|
|
||||||
class Page(brozzler.BaseDictable):
|
class Page(brozzler.BaseDictable):
|
||||||
def __init__(
|
def __init__(
|
||||||
@ -183,7 +260,3 @@ class Page(brozzler.BaseDictable):
|
|||||||
surt.GoogleURLCanonicalizer.canonicalize(self._canon_hurl)
|
surt.GoogleURLCanonicalizer.canonicalize(self._canon_hurl)
|
||||||
return self._canon_hurl.geturl()
|
return self._canon_hurl.geturl()
|
||||||
|
|
||||||
def to_surt(url):
|
|
||||||
hurl = surt.handyurl.parse(url)
|
|
||||||
return surt.GoogleURLCanonicalizer.canonicalize(
|
|
||||||
hurl).getURLString(surt=True, trailing_comma=True)
|
|
||||||
|
@ -34,6 +34,7 @@ import socket
|
|||||||
import datetime
|
import datetime
|
||||||
import collections
|
import collections
|
||||||
import requests
|
import requests
|
||||||
|
import rethinkstuff
|
||||||
|
|
||||||
class ExtraHeaderAdder(urllib.request.BaseHandler):
|
class ExtraHeaderAdder(urllib.request.BaseHandler):
|
||||||
def __init__(self, extra_headers):
|
def __init__(self, extra_headers):
|
||||||
@ -312,14 +313,20 @@ class BrozzlerWorker:
|
|||||||
status_info["browser_pool_size"] = self._browser_pool.size
|
status_info["browser_pool_size"] = self._browser_pool.size
|
||||||
status_info["browsers_in_use"] = self._browser_pool.num_in_use()
|
status_info["browsers_in_use"] = self._browser_pool.num_in_use()
|
||||||
|
|
||||||
self.status_info = self._service_registry.heartbeat(status_info)
|
try:
|
||||||
self.logger.debug("status in service registry: %s", self.status_info)
|
self.status_info = self._service_registry.heartbeat(status_info)
|
||||||
|
self.logger.debug(
|
||||||
|
"status in service registry: %s", self.status_info)
|
||||||
|
except rethinkdb.ReqlError as e:
|
||||||
|
self.logger.error(
|
||||||
|
"failed to send heartbeat and update service registry "
|
||||||
|
"with info %s: %s", status_info, e)
|
||||||
|
|
||||||
def run(self):
|
def run(self):
|
||||||
try:
|
try:
|
||||||
latest_state = None
|
latest_state = None
|
||||||
while not self._shutdown_requested.is_set():
|
while not self._shutdown_requested.is_set():
|
||||||
if self._service_registry and (not hasattr(self, "status_info") or (datetime.datetime.now(datetime.timezone.utc) - self.status_info["last_heartbeat"]).total_seconds() > self.HEARTBEAT_INTERVAL):
|
if self._service_registry and (not hasattr(self, "status_info") or (rethinkstuff.utcnow() - self.status_info["last_heartbeat"]).total_seconds() > self.HEARTBEAT_INTERVAL):
|
||||||
self._service_heartbeat()
|
self._service_heartbeat()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
2
setup.py
2
setup.py
@ -20,7 +20,7 @@ import setuptools
|
|||||||
import glob
|
import glob
|
||||||
|
|
||||||
setuptools.setup(name='brozzler',
|
setuptools.setup(name='brozzler',
|
||||||
version='1.1.dev6',
|
version='1.1.dev8',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/nlevitt/brozzler',
|
url='https://github.com/nlevitt/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
Loading…
x
Reference in New Issue
Block a user