/*
 * brozzler/behaviors.d/facebook.js.template - facebook behavior, scrolls to
 * the bottom of the page, clicks to expand images, a few other things
 *
 * Copyright (C) 2014-2016 Internet Archive
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * This file is a python string.Template: it is rendered with
 * safe_substitute() before being evaluated in the browser. A dollar sign
 * followed by a name or a brace is therefore a placeholder, which is why
 * this script builds strings with "+" and never uses JS template literals.
 * The only placeholders are parameter_username and parameter_password.
 *
 * Top-level names stay "var" globals, as in the original, so code outside
 * this script (whatever calls umbraBehaviorFinished) can still reach them.
 */

// comments - 'a.UFIPagerLink > span, a.UFIPagerLink, span.UFIReplySocialSentenceLinkText'
var UMBRA_THINGS_TO_CLICK_SELECTOR = 'a[href^="/browse/likes"], *[rel="theater"]';
// div[class="phm pluginLikeboxStream"] = facebook widget embedded in 3rd party pages
var UMBRA_THINGS_TO_SCROLL_SELECTOR = 'div[class="phm pluginLikeboxStream"]';
var UMBRA_CLOSE_BUTTON_SELECTOR = 'a[title="Close"], a.closeTheater, a[aria-label="Press Esc to close"], div.fbPhotoSnowlift.fbxPhoto a._xlt';
var NUMBER_FAILED_SCROLL_ATTEMPTS_ON_THING_TO_SCROLL_BEFORE_STOP_SCROLLING = 5;
// NOTE(review): the substituted values land inside these double-quoted
// literals as-is; a quote or backslash in a credential would presumably break
// the whole script - confirm that the caller escapes them.
var UMBRA_FB_USER_NAME = "${parameter_username}";
var UMBRA_FB_PASSWORD = "${parameter_password}";

// If we haven't had anything to do (scrolled, clicked, etc) in this amount of
// time, then we consider ourselves finished with the page.
var UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC = 10;

// Bookkeeping, keyed by umbraKeyFor(element).
var umbraAlreadyClicked = {};
var umbraAlreadyScrolledThing = {};
var umbraScrolledThingFailedScrollAttempts = {};
var umbraState = {'idleSince':null,'expectingSomething':null,'bottomReachedScrollY':0};
var umbraNextElementKey = 0;
var umbraIntervalId;

// Returns a string that identifies `element` in the bookkeeping objects.
//
// Using the element itself as a property name coerces it to a string. In a
// browser an anchor stringifies to its href, but every other element of a
// given type stringifies to the same "[object ...]" text, so distinct widgets
// used to share one entry. Links keep the href-based identity (two links to
// the same target count as one thing, as before); everything else gets a
// unique key remembered on the element.
var umbraKeyFor = function(element) {
    if (typeof element.href === "string" && element.href !== "") {
        return "href:" + element.href;
    }
    if (element.umbraElementKey === undefined) {
        umbraNextElementKey += 1;
        element.umbraElementKey = "element:" + umbraNextElementKey;
    }
    return element.umbraElementKey;
};

// True if `key` was recorded in `registry` (own properties only, so names
// inherited from Object.prototype never count).
var umbraIsRecorded = function(registry, key) {
    return Object.prototype.hasOwnProperty.call(registry, key);
};

// Starts the idle clock unless it is already running.
var umbraMarkIdle = function() {
    if (umbraState.idleSince === null) {
        umbraState.idleSince = Date.now();
    }
};

// Returns -1 if the top edge of `e` is above the viewport, 1 if it is below,
// 0 if it is on screen.
var umbraAboveBelowOrOnScreen = function(e) {
    // getBoundingClientRect() is relative to the viewport, so the visible band
    // is 0..window.innerHeight. Comparing against window.scrollY (a document
    // offset) misclassified on-screen elements as "above" once the page had
    // been scrolled further than one viewport height.
    var eTop = e.getBoundingClientRect().top;
    if (eTop < 0) {
        return -1; // above
    } else if (eTop > window.innerHeight) {
        return 1; // below
    }
    return 0; // on screen
};

// Scrolls every not-yet-finished element of `thingsToScroll` to its bottom.
// Returns true if all of them were already finished before this call.
var umbraScrollThings = function(thingsToScroll) {
    var everythingScrolled = true;

    for (var i = 0; i < thingsToScroll.length; i++) {
        var target = thingsToScroll[i];
        var key = umbraKeyFor(target);

        if (umbraIsRecorded(umbraAlreadyScrolledThing, key)) {
            console.log("done scrolling for element with nodeName " + target.nodeName + " with id of " + target.id);
        } else {
            everythingScrolled = false;

            console.log("scrolling to " + target.scrollHeight + " on element with nodeName " + target.nodeName + " with id of " + target.id);
            var lastScrollTop = target.scrollTop;
            target.scrollTop = target.scrollHeight;

            umbraState.idleSince = null;

            // NOTE(review): scrollTop normally tops out at scrollHeight minus
            // the visible height, so this first test looks like it can only
            // hold for a zero-height element; in practice elements seem to
            // finish through the failed-attempt counter, which also gives
            // lazily loaded content a few ticks to arrive. Left unchanged.
            if (target.scrollTop >= target.scrollHeight) {
                umbraAlreadyScrolledThing[key] = true;
            } else if (target.scrollTop == lastScrollTop) {
                var failures = (umbraScrolledThingFailedScrollAttempts[key] || 0) + 1;
                umbraScrolledThingFailedScrollAttempts[key] = failures;
                if (failures >= NUMBER_FAILED_SCROLL_ATTEMPTS_ON_THING_TO_SCROLL_BEFORE_STOP_SCROLLING) {
                    umbraAlreadyScrolledThing[key] = true;
                }
            } else {
                // reset failed count on a successful scroll
                umbraScrolledThingFailedScrollAttempts[key] = 0;
            }
        }

        umbraState.expectingSomething = null;
    }

    return everythingScrolled;
};

// Clicks the first visible close button, if any. Returns true if it clicked.
var umbraClickVisibleCloseButton = function() {
    var closeButtons = document.querySelectorAll(UMBRA_CLOSE_BUTTON_SELECTOR);
    for (var i = 0; i < closeButtons.length; i++) {
        var button = closeButtons[i];
        // XXX closeTheater buttons stick around in the dom after closing,
        // clientWidth>0 is one way to check if they're visible
        if (!(button.clientWidth > 0)) {
            continue;
        }
        if (umbraState.expectingSomething === 'closeButton') {
            console.log("found expected close button, clicking on it " + button.outerHTML);
            umbraState.expectingSomething = null;
        } else {
            console.warn("found UNexpected close button, umbraState.expectingSomething=" + umbraState.expectingSomething + " ... clicking on it " + button.outerHTML);
        }
        button.click();
        return true;
    }
    return false;
};

// Clicks the first on-screen thing that has not been clicked yet and reports
// what it saw: whether it clicked, and whether unclicked things remain below
// or above the viewport.
var umbraClickNextThing = function() {
    var outcome = {clickedSomething: false, somethingLeftBelow: false, somethingLeftAbove: false};
    var thingsToClick = document.querySelectorAll(UMBRA_THINGS_TO_CLICK_SELECTOR);

    for (var i = 0; i < thingsToClick.length; i++) {
        var target = thingsToClick[i];
        var key = umbraKeyFor(target);
        if (umbraIsRecorded(umbraAlreadyClicked, key)) {
            continue;
        }

        var where = umbraAboveBelowOrOnScreen(target);
        if (where === 0) { // on screen
            console.log("clicking at " + target.getBoundingClientRect().top + " on " + target.outerHTML);
            if (typeof target.click === "function") {
                // a click is expected to open an overlay that must be closed
                // again before anything else is clicked
                umbraState.expectingSomething = 'closeButton';
                target.click();
            }
            target.style.border = '1px solid #0a0';
            umbraAlreadyClicked[key] = true;
            outcome.clickedSomething = true;
            umbraState.idleSince = null;
            break;
        } else if (where > 0) {
            outcome.somethingLeftBelow = true;
        } else {
            outcome.somethingLeftAbove = true;
        }
    }

    return outcome;
};

// One tick of the behavior (scheduled every 200 ms by the code at the bottom).
// In order: scroll embedded widgets; close an open overlay; click the next
// thing on screen; otherwise scroll the window, or start the idle clock when
// nothing is left to do.
var umbraIntervalFunc = function() {
    var thingsToScroll = document.querySelectorAll(UMBRA_THINGS_TO_SCROLL_SELECTOR);
    var everythingScrolled = umbraScrollThings(thingsToScroll);

    if (thingsToScroll.length > 0 && everythingScrolled) {
        umbraMarkIdle();
        return;
    }

    if (umbraClickVisibleCloseButton()) {
        return;
    }
    // NOTE(review): if the expected close button never shows up, every later
    // tick returns here and the idle clock never starts - confirm that a hard
    // timeout elsewhere covers this case.
    if (umbraState.expectingSomething === 'closeButton') {
        console.log("waiting for close button, haven't seen it yet");
        return;
    }

    var outcome = umbraClickNextThing();

    if (window.scrollY > umbraState.bottomReachedScrollY) {
        umbraState.bottomReachedScrollY = window.scrollY;
    }

    if (outcome.clickedSomething) {
        return;
    }

    var bottomNotReached = umbraState.bottomReachedScrollY + window.innerHeight < document.documentElement.scrollHeight;
    if (outcome.somethingLeftBelow || bottomNotReached) {
        // more to click below, or we haven't been to the bottom yet
        window.scrollBy(0, 300);
        umbraState.idleSince = null;
    } else if (outcome.somethingLeftAbove) {
        // already been to the bottom and everything on or below this screen
        // has been clicked, but we missed something above
        window.scrollBy(0, -600);
        umbraState.idleSince = null;
    } else {
        umbraMarkIdle();
    }
};

// True if `value` was never filled in: it is empty or still starts with the
// template placeholder syntax. (The previous test looked for the substring
// "parameter" anywhere in the value, which also matched real credentials that
// happen to contain that word.) The dollar sign is written as an escape so
// that the template engine does not see a placeholder here.
var umbraIsUnsetParameter = function(value) {
    return value === "" || value.indexOf("\u0024{parameter_") === 0;
};

// True if both the user name and the password were supplied by the caller.
var umbraCredentialsProvided = function() {
    return !umbraIsUnsetParameter(UMBRA_FB_USER_NAME) && !umbraIsUnsetParameter(UMBRA_FB_PASSWORD);
};

// Fills in and submits the facebook login form.
// Returns true if the form was submitted, false if any of its controls is
// missing (previously that case threw a TypeError and nothing else ran).
var umbraFacebookLogin = function() {
    var emailInput = document.querySelector("form#login_form input#email");
    var passwordInput = document.querySelector("form#login_form input#pass");
    var loginButton = document.querySelector("form#login_form label#loginbutton > input");
    if (emailInput === null || passwordInput === null || loginButton === null) {
        console.warn("login form is missing its email, password or submit control, not logging in");
        return false;
    }
    emailInput.value = UMBRA_FB_USER_NAME;
    passwordInput.value = UMBRA_FB_PASSWORD;
    loginButton.click();
    return true;
};

// Called from outside of this script.
// Returns true once the behavior has been idle for longer than
// UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC.
var umbraBehaviorFinished = function() {
    if (umbraState.idleSince === null) {
        return false;
    }
    var idleTimeMs = Date.now() - umbraState.idleSince;
    return idleTimeMs / 1000 > UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC;
};

// Entry point: log in when a login form is present and credentials were
// supplied, otherwise (or if the login could not be attempted) start the
// regular behavior. Skipped entirely when there is no DOM.
if (typeof document !== "undefined") {
    var umbraLoginSubmitted = document.getElementById("login_form") !== null
            && umbraCredentialsProvided()
            && umbraFacebookLogin();
    if (!umbraLoginSubmitted) {
        umbraIntervalId = setInterval(umbraIntervalFunc, 200);
    }
}

/*
 * The rest of this span is patch text for other files, not part of this
 * script; it is kept verbatim so that nothing is lost:
 *
 * diff --git a/brozzler/behaviors.d/simpleclicks.js.in b/brozzler/behaviors.d/simpleclicks.js.template similarity index 100% rename from brozzler/behaviors.d/simpleclicks.js.in rename to brozzler/behaviors.d/simpleclicks.js.template diff --git a/brozzler/behaviors.py b/brozzler/behaviors.py index 62de529..ccfc881 100644 --- a/brozzler/behaviors.py +++ b/brozzler/behaviors.py @@ -42,43 +42,49 @@ class Behavior: conf = yaml.load(fin) Behavior._behaviors = conf['behaviors'] - simpleclicks_js_in = os.path.sep.join(__file__.split(os.path.sep)[:-1] + ["behaviors.d"] +
 */
["simpleclicks.js.in"]) - with open(simpleclicks_js_in) as fin: - simpleclicks_js_template = string.Template(fin.read()) - for behavior in Behavior._behaviors: if "behavior_js" in behavior: behavior_js = os.path.sep.join(__file__.split(os.path.sep)[:-1] + ["behaviors.d"] + [behavior["behavior_js"]]) - behavior["script"] = open(behavior_js, encoding="utf-8").read() - elif "click_css_selector" in behavior: - if "click_until_hard_timeout" in behavior: - click_until_hard_timeout_value=behavior["click_until_hard_timeout"] - else: - click_until_hard_timeout_value = False - behavior["script"] = simpleclicks_js_template.substitute(click_css_selector=behavior["click_css_selector"], click_until_hard_timeout=click_until_hard_timeout_value) + with open(behavior_js, encoding="utf-8") as fin: + behavior["script"] = fin.read() + elif "behavior_js_template" in behavior: + behavior_js_template = os.path.sep.join(__file__.split(os.path.sep)[:-1] + ["behaviors.d"] + [behavior["behavior_js_template"]]) + with open(behavior_js_template, encoding="utf-8") as fin: + behavior["template"] = string.Template(fin.read()) return Behavior._behaviors def __init__(self, url, umbra_worker): self.url = url self.umbra_worker = umbra_worker - self.script_finished = False self.waiting_result_msg_ids = [] self.active_behavior = None self.last_activity = time.time() - def start(self): + def start(self, template_parameters=None): for behavior in Behavior.behaviors(): if re.match(behavior['url_regex'], self.url): if "behavior_js" in behavior: - self.logger.info("using {} behavior for {}".format(behavior["behavior_js"], self.url)) - elif "click_css_selector" in behavior: - self.logger.info("using simple click behavior with css selector {} for {}".format(behavior["click_css_selector"], self.url)) + self.logger.info("using %s behavior for %s", + behavior["behavior_js"], self.url) + elif "behavior_js_template" in behavior: + parameters = dict() + if "default_parameters" in behavior: + 
parameters.update(behavior["default_parameters"]) + if template_parameters: + parameters.update(template_parameters) + behavior["script"] = behavior["template"].safe_substitute(parameters) + + self.logger.info( + "using template=%s populated with parameters=%s for %s", + repr(behavior["behavior_js_template"]), + parameters, self.url) self.active_behavior = behavior - self.umbra_worker.send_to_chrome(method="Runtime.evaluate", - suppress_logging=True, params={"expression": behavior["script"]}) + self.umbra_worker.send_to_chrome( + method="Runtime.evaluate", suppress_logging=True, + params={"expression": behavior["script"]}) self.notify_of_activity() return diff --git a/brozzler/behaviors.yaml b/brozzler/behaviors.yaml index 7d980d1..13af540 100644 --- a/brozzler/behaviors.yaml +++ b/brozzler/behaviors.yaml @@ -20,7 +20,10 @@ behaviors: - url_regex: '^https?://(?:www\.)?facebook\.com/.*$' - behavior_js: facebook.js + behavior_js_template: facebook.js.template + # default_parameters: + # parameter_username: jdoe@example.com + # parameter_password: abcd1234 request_idle_timeout_sec: 30 - url_regex: '^https?://(?:www\.)?flickr\.com/.*$' @@ -42,36 +45,50 @@ behaviors: url_regex: '^https?://(?:www\.)?instagram\.com/.*$' behavior_js: instagram.js request_idle_timeout_sec: 10 - - + - url_regex: '^https?://(?:www\.)?brooklynmuseum\.org/exhibitions/.*$' - click_css_selector: img.img-responsive + behavior_js_template: simpleclicks.js.template + default_parameters: + click_css_selector: img.img-responsive request_idle_timeout_sec: 10 - # acalog https://webarchive.jira.com/browse/ARI-3775 url_regex: '^https?://.*[?&]catoid=[^?]*$' - click_css_selector: a[onclick] + behavior_js_template: simpleclicks.js.template + default_parameters: + click_css_selector: a[onclick] request_idle_timeout_sec: 10 - # https://webarchive.jira.com/browse/ARI-3956 url_regex: '^https?://(?:www\.)?usask.ca/.*$' - click_css_selector: a[id='feature-next'] + behavior_js_template: simpleclicks.js.template + 
default_parameters: + click_css_selector: a[id='feature-next'] request_idle_timeout_sec: 10 - # https://webarchive.jira.com/browse/AITFIVE-451 url_regex: '^https?://(?:www\.)?soundcloud.com/.*$' - click_css_selector: button.sc-button-play, button.playButton + behavior_js_template: simpleclicks.js.template + default_parameters: + click_css_selector: button.sc-button-play, button.playButton request_idle_timeout_sec: 10 - # https://webarchive.jira.com/browse/AITFIVE-463 url_regex: '^https?://(?:www\.)?christophercerrone.com/.*$' - click_css_selector: button.playButton.medium + behavior_js_template: simpleclicks.js.template + default_parameters: + click_css_selector: button.playButton.medium request_idle_timeout_sec: 10 - # https://webarchive.jira.com/browse/ARI-4690 url_regex: '^https?://(?:www\.)?youtube.com/.*$' - click_css_selector: span.load-more-text + behavior_js_template: simpleclicks.js.template + default_parameters: + click_css_selector: span.load-more-text request_idle_timeout_sec: 10 - # https://webarchive.jira.com/browse/ARI-4725 url_regex: '^https?://(?:www\.)?moma.org/.*$' - click_css_selector: button[data-more-results-bottom-button] - click_until_hard_timeout: True - request_idle_timeout_sec: 10 - - # default fallback brhavior + behavior_js_template: simpleclicks.js.template + default_parameters: + click_css_selector: button[data-more-results-bottom-button] + click_until_hard_timeout: True + request_idle_timeout_sec: 10 + - # default fallback behavior url_regex: '^.*$' request_idle_timeout_sec: 10 behavior_js: default.js diff --git a/brozzler/browser.py b/brozzler/browser.py index 4344a0c..3b16486 100644 --- a/brozzler/browser.py +++ b/brozzler/browser.py @@ -157,8 +157,10 @@ class Browser: def abort_browse_page(self): self._abort_browse_page = True - def browse_page(self, url, extra_headers=None, on_request=None, - on_screenshot=None, on_url_change=None): + def browse_page( + self, url, extra_headers=None, behavior_parameters=None, + on_request=None, 
on_response=None, on_screenshot=None, + on_url_change=None): """Synchronously loads a page, takes a screenshot, and runs behaviors. Raises BrowsingException if browsing the page fails in a non-critical @@ -173,6 +175,8 @@ class Browser: self.on_request = on_request self.on_screenshot = on_screenshot self.on_url_change = on_url_change + self.on_response = on_response + self.behavior_parameters = behavior_parameters self._waiting_on_screenshot_msg_id = None self._waiting_on_document_url_msg_id = None @@ -301,10 +305,15 @@ class Browser: def _network_response_received(self, message): if (not self._reached_limit and message["params"]["response"]["status"] == 420 - and "Warcprox-Meta" in CaseInsensitiveDict(message["params"]["response"]["headers"])): - warcprox_meta = json.loads(CaseInsensitiveDict(message["params"]["response"]["headers"])["Warcprox-Meta"]) - self._reached_limit = brozzler.ReachedLimit(warcprox_meta=warcprox_meta) + and "Warcprox-Meta" in CaseInsensitiveDict( + message["params"]["response"]["headers"])): + warcprox_meta = json.loads(CaseInsensitiveDict( + message["params"]["response"]["headers"])["Warcprox-Meta"]) + self._reached_limit = brozzler.ReachedLimit( + warcprox_meta=warcprox_meta) self.logger.info("reached limit %s", self._reached_limit) + if self.on_response: + self.on_response(message) def _page_load_event_fired(self, message): self.logger.info("Page.loadEventFired, requesting screenshot url={} message={}".format(self.url, message)) @@ -335,7 +344,7 @@ class Browser: self._waiting_on_screenshot_msg_id = None self.logger.info("got screenshot, moving on to starting behaviors url={}".format(self.url)) self._behavior = Behavior(self.url, self) - self._behavior.start() + self._behavior.start(self.behavior_parameters) elif message["id"] == self._waiting_on_outlinks_msg_id: self.logger.debug("got outlinks message=%s", message) self._outlinks = frozenset(message["result"]["result"]["value"].split(" ")) @@ -347,10 +356,8 @@ class Browser: elif 
self._behavior and self._behavior.is_waiting_on_result(message["id"]): self._behavior.notify_of_result(message) - def _handle_message(self, websock, message): - # self.logger.debug("message from {} - {}".format(websock.url, message[:95])) - # self.logger.debug("message from {} - {}".format(websock.url, message)) - message = json.loads(message) + def _handle_message(self, websock, json_message): + message = json.loads(json_message) if "method" in message and message["method"] == "Network.requestWillBeSent": self._network_request_will_be_sent(message) elif "method" in message and message["method"] == "Network.responseReceived": @@ -368,9 +375,9 @@ class Browser: # elif "method" in message and message["method"] in ("Network.dataReceived", "Network.responseReceived", "Network.loadingFinished"): # pass # elif "method" in message: - # self.logger.debug("{} {}".format(message["method"], message)) + # self.logger.debug("{} {}".format(message["method"], json_message)) # else: - # self.logger.debug("[no-method] {}".format(message)) + # self.logger.debug("[no-method] {}".format(json_message)) class Chrome: logger = logging.getLogger(__module__ + "." 
+ __qualname__) @@ -396,15 +403,16 @@ class Chrome: timeout_sec = 600 new_env = os.environ.copy() new_env["HOME"] = self.user_home_dir - chrome_args = [self.executable, - "--use-mock-keychain", # mac thing + chrome_args = [ + self.executable, "--use-mock-keychain", # mac thing "--user-data-dir={}".format(self.user_data_dir), "--remote-debugging-port={}".format(self.port), "--disable-web-sockets", "--disable-cache", "--window-size=1100,900", "--no-default-browser-check", "--disable-first-run-ui", "--no-first-run", "--homepage=about:blank", "--disable-direct-npapi-requests", - "--disable-web-security"] + "--disable-web-security", "--disable-notifications", + "--disable-save-password-bubble"] if self.ignore_cert_errors: chrome_args.append("--ignore-certificate-errors") if self.proxy: diff --git a/setup.py b/setup.py index fdf118b..f1aff19 100644 --- a/setup.py +++ b/setup.py @@ -20,7 +20,7 @@ import setuptools import glob setuptools.setup(name='brozzler', - version='1.1.dev7', + version='1.1.dev8', description='Distributed web crawling with browsers', url='https://github.com/nlevitt/brozzler', author='Noah Levitt',