diff --git a/README.rst b/README.rst index 3cd2ba3..cf184e7 100644 --- a/README.rst +++ b/README.rst @@ -157,7 +157,7 @@ Next install the build tools and fetch the source code: mkdir -p ~/chromium cd ~/chromium git clone https://chromium.googlesource.com/chromium/tools/depot_tools.git - export $PATH=$PWD/depot_tools:$PATH + export PATH=$PWD/depot_tools:$PATH fetch --no-history chromium --nosvn=True Configure a headless release build (the debug builds are much larger): @@ -195,13 +195,13 @@ option: chmod +x ~/bin/headless_chromium.sh brozzler-worker --chrome-exe ~/bin/headless_chromium.sh -The Pepper Flash plugin ``libpepflashplayer.so`` from an official Google Chrome -release may be used with Headless Chromium by adding this option to the wrapper -script: +To render Flash content, `download <https://get.adobe.com/flashplayer/otherversions/>`_ +and extract the Linux (.tar.gz) PPAPI plugin. Configure Headless Chromium +to load the plugin by adding this option to your wrapper script: :: - --register-pepper-plugins=/opt/google/chrome/PepperFlash/libpepflashplayer.so;application/x-shockwave-flash + --register-pepper-plugins="/opt/PepperFlash/libpepflashplayer.so;application/x-shockwave-flash" License ------- diff --git a/brozzler/__init__.py b/brozzler/__init__.py index 8acee2d..02cc9b2 100644 --- a/brozzler/__init__.py +++ b/brozzler/__init__.py @@ -19,6 +19,7 @@ limitations under the License. import json as _json import logging as _logging +import surt as _surt from pkg_resources import get_distribution as _get_distribution __version__ = _get_distribution('brozzler').version @@ -64,6 +65,16 @@ class BaseDictable: def __repr__(self): return "{}(**{})".format(self.__class__.__name__, self.to_dict()) +def fixup(url): + ''' + Does rudimentary canonicalization, such as converting IDN to punycode. 
+ ''' + hurl = _surt.handyurl.parse(url) + # handyurl.parse() already lowercases the scheme via urlsplit + if hurl.host: + hurl.host = hurl.host.encode('idna').decode('ascii').lower() + return hurl.getURLString() + # logging level more fine-grained than logging.DEBUG==10 TRACE = 5 diff --git a/brozzler/behaviors.d/instagram.js b/brozzler/behaviors.d/instagram.js index ef4759d..e1ad3e4 100644 --- a/brozzler/behaviors.d/instagram.js +++ b/brozzler/behaviors.d/instagram.js @@ -16,115 +16,30 @@ * limitations under the License. */ + var umbraInstagramBehavior = { IDLE_TIMEOUT_SEC: 20, idleSince: null, - state: "loading-thumbs", - imageCount: null, - bigImagesLoaded: 0, - currentBigImage: null, - previousBigImage: null, intervalFunc: function() { - if (this.state === "loading-thumbs") { - if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) { - window.scrollBy(0, 200); - this.idleSince = null; - return; - } - - var moreButtons = document.querySelectorAll(".PhotoGridMoreButton:not(.pgmbDisabled)"); - if (moreButtons.length > 0) { - console.log("clicking load more button"); - moreButtons[0].click(); - this.idleSince = null; - return; - } - - if (this.idleSince == null) { - console.log("nothing to do at the moment, might be waiting for something to load, setting this.idleSince=Date.now()"); - this.idleSince = Date.now(); - return; - } else { - var doneButtons = document.querySelectorAll(".PhotoGridMoreButton.pgmbDisabled"); - if (Date.now() - this.idleSince > 9000 || (doneButtons.length > 0 && doneButtons[0].innerText === "All items loaded") ) { - console.log("finished loading-thumbs, it appears we have reached the bottom"); - this.state = "clicking-first-thumb"; - this.idleSince = null; - return; - } else { - // console.log("still might be waiting for something to load..."); - return; - } - } - } - - if (this.state === "clicking-first-thumb") { - var images = document.querySelectorAll("a.pgmiImageLink"); - if (images && images !== "undefined") { 
- this.imageCount = images.length; - if (images.length > 0) { - console.log("clicking first thumbnail"); - images[0].click(); - this.idleSince = null; - this.state = "waiting-big-image"; - return; - } - } - - console.log("no big images to load?"); - this.idleSince = Date.now(); + if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) { + window.scrollBy(0, 200); + this.idleSince = null; return; } - if (this.state === "waiting-big-image") { - if(this.currentBigImage == null) { - var imageFrame = document.querySelectorAll("div.Modal div.Item div.iMedia div.Image"); - if (imageFrame.length > 0 && imageFrame[0].getAttribute("src") !== this.previousBigImage ) { - this.currentBigImage = new Image(); - this.currentBigImage.src = imageFrame[0].getAttribute("src"); - //console.log("this.currentBigImage.naturalWidth=" + this.currentBigImage.naturalWidth + " this.currentBigImage.src=" + this.currentBigImage.src); - return; - } else if(this.idleSince == null ) { - console.log("waiting for image frame to load"); - this.idleSince = Date.now(); - return; - } - } else if (this.currentBigImage.src !== this.previousBigImage && this.currentBigImage.naturalWidth !== 0) { - console.log("next big image appears loaded, will click right arrow next time"); - this.state = "click-next-big-image"; - this.previousBigImage = this.currentBigImage.src; - this.currentBigImage = null; - this.bigImagesLoaded++; - this.idleSince = null; - - if (this.bigImagesLoaded >= this.imageCount) { - console.log("looks like we're done, we've loaded all " + this.bigImagesLoaded + " of " + this.imageCount + " big images"); - this.state = "finished"; - this.idleSince = Date.now(); - } - return; - } else if(this.idleSince == null) { - console.log("Waiting for big image to load"); - this.idleSince = Date.now(); - return; - } - + var moreButtons = document.querySelectorAll("a._oidfu"); + if (moreButtons.length > 0) { + console.log("clicking load more button"); + moreButtons[0].click(); + 
this.idleSince = null; + return; } - if (this.state === "click-next-big-image") { - var rightArrow = document.querySelectorAll("a.mmRightArrow"); - if (rightArrow.length > 0) { - // console.log("clicking right arrow"); - rightArrow[0].click(); - this.state = "waiting-big-image"; - this.idleSince = null; - return; - } else { - console.warn("no right arrow to click?? weird"); - this.idleSince = Date.now(); - return; - } + if (this.idleSince == null) { + console.log("nothing to do at the moment, might be waiting for something to load, setting this.idleSince=Date.now()"); + this.idleSince = Date.now(); + return; } }, diff --git a/brozzler/behaviors.d/mouseovers.js.template b/brozzler/behaviors.d/mouseovers.js.template new file mode 100644 index 0000000..f4d6173 --- /dev/null +++ b/brozzler/behaviors.d/mouseovers.js.template @@ -0,0 +1,136 @@ +/* + * brozzler/behaviors.d/mouseovers.js.template - mouseovers behavior template, + * mouseovers on elements matching templatized css selector + * + * Copyright (C) 2014-2016 Internet Archive + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +var umbraBehavior = { + IDLE_TIMEOUT_SEC : 10, + idleSince : null, + alreadyMouseovered : {}, + + intervalFunc : function() { + var mouseoveredSomething = false; + var somethingLeftBelow = false; + var somethingLeftAbove = false; + var cssSelector = "${mouseover_css_selector}"; + var mouseoverUntilTimeout = "${mouseover_until_hard_timeout}"; + + //handle Python to JavaScript boolean conversion + mouseoverUntilTimeout == "True" ? mouseoverUntilTimeout = true : mouseoverUntilTimeout = false; + + var iframes = document.querySelectorAll("iframe"); + var documents = Array(iframes.length + 1); + documents[0] = document; + + for (var i = 0; i < iframes.length; i++) { + documents[i+1] = iframes[i].contentWindow.document; + } + + for (var j = 0; j < documents.length; j++) { + + var mouseoverTargets = documents[j].querySelectorAll(cssSelector); + + for ( var i = 0; i < mouseoverTargets.length; i++) { + if (mouseoverTargets[i].umbraMouseovered && !mouseoverUntilTimeout) { + continue; + } + + var where = this.aboveBelowOrOnScreen(mouseoverTargets[i]); + + if (where == 0) { + console.log("mouseovering on " + mouseoverTargets[i].outerHTML); + // do mouse over event on mouseover target + // since some urls are requested only on + // this event - see + // https://webarchive.jira.com/browse/AITFIVE-451 + var mouseOverEvent = document.createEvent('Events'); + mouseOverEvent.initEvent("mouseover",true, false); + mouseoverTargets[i].dispatchEvent(mouseOverEvent); + mouseoveredSomething = true; + this.idleSince = null; + mouseoverTargets[i].umbraMouseovered = true; + + break; //break from mouseoverTargets loop, but not from iframe loop + } else if (where > 0) { + somethingLeftBelow = true; + } else if (where < 0) { + somethingLeftAbove = true; + } + } + } + + if (!mouseoveredSomething) { + if (somethingLeftAbove) { + // console.log("scrolling UP because everything on this screen has been mouseovered but we missed something above"); + window.scrollBy(0, -500); + this.idleSince = 
null; + } else if (somethingLeftBelow) { + // console.log("scrolling because everything on this screen has been mouseovered but there's more below document.body.clientHeight=" + // + document.body.clientHeight); + window.scrollBy(0, 200); + this.idleSince = null; + } else if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) { + // console.log("scrolling because we're not to the bottom yet document.body.clientHeight=" + // + document.body.clientHeight); + window.scrollBy(0, 200); + this.idleSince = null; + } else if (this.idleSince == null) { + this.idleSince = Date.now(); + } + } + + if (!this.idleSince) { + this.idleSince = Date.now(); + } + }, + + start : function() { + var that = this; + this.intervalId = setInterval(function() { + that.intervalFunc() + }, 250); + }, + + isFinished : function() { + if (this.idleSince != null) { + var idleTimeMs = Date.now() - this.idleSince; + if (idleTimeMs / 1000 > this.IDLE_TIMEOUT_SEC) { + clearInterval(this.intervalId); + return true; + } + } + return false; + }, + + aboveBelowOrOnScreen : function(e) { + var eTop = e.getBoundingClientRect().top; + if (eTop < window.scrollY) { + return -1; // above + } else if (eTop > window.scrollY + window.innerHeight) { + return 1; // below + } else { + return 0; // on screen + } + }, +}; + +// Called from outside of this script. 
+var umbraBehaviorFinished = function() { + return umbraBehavior.isFinished() +}; + +umbraBehavior.start(); diff --git a/brozzler/behaviors.yaml b/brozzler/behaviors.yaml index 90322e6..73566cc 100644 --- a/brozzler/behaviors.yaml +++ b/brozzler/behaviors.yaml @@ -104,6 +104,11 @@ behaviors: url_regex: '^https?://(?:www\.)?fec.gov/data/.*$' behavior_js: fec_gov.js request_idle_timeout_sec: 10 + - url_regex: '^https?://(?:www\.)?news\.com\.au/.*$' + behavior_js_template: mouseovers.js.template + default_parameters: + mouseover_css_selector: .menu-item a + request_idle_timeout_sec: 10 - # default fallback behavior url_regex: '^.*$' request_idle_timeout_sec: 10 diff --git a/brozzler/job_schema.yaml b/brozzler/job_schema.yaml index 0f23c17..4bea483 100644 --- a/brozzler/job_schema.yaml +++ b/brozzler/job_schema.yaml @@ -1,5 +1,7 @@ id: - type: string + type: + - string + - integer required: true <<: &multi_level_options @@ -79,4 +81,4 @@ seeds: type: url required: true - <<: *multi_level_options \ No newline at end of file + <<: *multi_level_options diff --git a/brozzler/site.py b/brozzler/site.py index f474d9b..1dcd90f 100644 --- a/brozzler/site.py +++ b/brozzler/site.py @@ -54,11 +54,15 @@ class Url: return self._host def matches_ip_or_domain(self, ip_or_domain): - """Returns true if - - ip_or_domain is an ip address and self.host is the same ip address - - ip_or_domain is a domain and self.host is the same domain - - ip_or_domain is a domain and self.host is a subdomain of it """ + Returns true if + - ip_or_domain is an ip address and self.host is the same ip address + - ip_or_domain is a domain and self.host is the same domain + - ip_or_domain is a domain and self.host is a subdomain of it + """ + if not self.host: + return False + if ip_or_domain == self.host: return True diff --git a/brozzler/worker.py b/brozzler/worker.py index cd6237a..2b3e745 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -53,7 +53,7 @@ class 
YoutubeDLSpy(urllib.request.BaseHandler): self.reset() def _http_response(self, request, response): - self.transactions.append(YoutubeDLSpy.Transaction(request,response)) + self.transactions.append(YoutubeDLSpy.Transaction(request, response)) return response http_response = https_response = _http_response @@ -175,11 +175,11 @@ class BrozzlerWorker: try: with urllib.request.urlopen(request) as response: - if response.status != 204: + if response.getcode() != 204: self.logger.warn( 'got "%s %s" response on warcprox ' 'WARCPROX_WRITE_RECORD request (expected 204)', - response.status, response.reason) + response.getcode(), response.reason) except urllib.error.HTTPError as e: self.logger.warn( 'got "%s %s" response on warcprox ' @@ -197,7 +197,8 @@ class BrozzlerWorker: "with youtube-dl json for %s", page) self._warcprox_write_record( warcprox_address=self._proxy(site), - url="youtube-dl:%s" % page.url, warc_type="metadata", + url="youtube-dl:%s" % brozzler.fixup(page.url), + warc_type="metadata", content_type="application/vnd.youtube-dl_formats+json;charset=utf-8", payload=info_json.encode("utf-8"), extra_headers=site.extra_headers()) @@ -237,12 +238,12 @@ class BrozzlerWorker: screenshot_jpeg, thumbnail_jpeg = self.full_and_thumb_jpegs( screenshot_png) self._warcprox_write_record(warcprox_address=self._proxy(site), - url="screenshot:{}".format(page.url), + url="screenshot:%s" % brozzler.fixup(page.url), warc_type="resource", content_type="image/jpeg", payload=screenshot_jpeg, extra_headers=site.extra_headers()) self._warcprox_write_record(warcprox_address=self._proxy(site), - url="thumbnail:{}".format(page.url), + url="thumbnail:%s" % brozzler.fixup(page.url), warc_type="resource", content_type="image/jpeg", payload=thumbnail_jpeg, extra_headers=site.extra_headers()) @@ -311,7 +312,7 @@ class BrozzlerWorker: def _already_fetched(self, page, brozzler_spy): for txn in brozzler_spy.final_bounces(page.url): if (txn.request.get_method() == 'GET' - and txn.response.status 
== 200): + and txn.response.getcode() == 200): return True return False diff --git a/setup.py b/setup.py index babd7c8..8d3fb71 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b7.dev105', + version='1.1b7.dev109', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt',