Merge branch 'instagram' into qa

This commit is contained in:
Barbara Miller 2016-11-03 15:49:26 -07:00
commit 65a46f4558
9 changed files with 192 additions and 118 deletions

View File

@ -157,7 +157,7 @@ Next install the build tools and fetch the source code:
mkdir -p ~/chromium
cd ~/chromium
git clone https://chromium.googlesource.com/chromium/tools/depot_tools.git
export $PATH=$PWD/depot_tools:$PATH
export PATH=$PWD/depot_tools:$PATH
fetch --no-history chromium --nosvn=True
Configure a headless release build (the debug builds are much larger):
@ -195,13 +195,13 @@ option:
chmod +x ~/bin/headless_chromium.sh
brozzler-worker --chrome-exe ~/bin/headless_chromium.sh
The Pepper Flash plugin ``libpepflashplayer.so`` from an official Google Chrome
release may be used with Headless Chromium by adding this option to the wrapper
script:
To render Flash content, `download <https://get.adobe.com/flashplayer/otherversions/>`_
and extract the Linux (.tar.gz) PPAPI plugin. Configure Headless Chromium
to load the plugin by adding this option to your wrapper script:
::
--register-pepper-plugins=/opt/google/chrome/PepperFlash/libpepflashplayer.so;application/x-shockwave-flash
--register-pepper-plugins="/opt/PepperFlash/libpepflashplayer.so;application/x-shockwave-flash"
License
-------

View File

@ -19,6 +19,7 @@ limitations under the License.
import json as _json
import logging as _logging
import surt as _surt
from pkg_resources import get_distribution as _get_distribution
__version__ = _get_distribution('brozzler').version
@ -64,6 +65,16 @@ class BaseDictable:
def __repr__(self):
return "{}(**{})".format(self.__class__.__name__, self.to_dict())
def fixup(url):
    '''
    Does rudimentary canonicalization, such as converting IDN to punycode.

    Parses `url` with surt's handyurl, replaces the host (when there is
    one) with its lowercased punycode form, and returns the url string that
    handyurl reassembles. A url without a host is only round-tripped
    through handyurl.

    Canonicalization is best-effort: a host that python's idna codec
    rejects (it raises UnicodeError for an empty label or a label that is
    too long) is lowercased and otherwise left as it was, rather than
    letting the exception escape to the caller.
    '''
    hurl = _surt.handyurl.parse(url)
    # handyurl.parse() already lowercases the scheme via urlsplit
    if hurl.host:
        try:
            hurl.host = hurl.host.encode('idna').decode('ascii').lower()
        except UnicodeError:
            # not encodable as idna; keep the host, just normalize its case
            hurl.host = hurl.host.lower()
    return hurl.getURLString()
# logging level more fine-grained than logging.DEBUG==10
TRACE = 5

View File

@ -16,115 +16,30 @@
* limitations under the License.
*/
var umbraInstagramBehavior = {
IDLE_TIMEOUT_SEC: 20,
idleSince: null,
state: "loading-thumbs",
imageCount: null,
bigImagesLoaded: 0,
currentBigImage: null,
previousBigImage: null,
intervalFunc: function() {
if (this.state === "loading-thumbs") {
if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) {
window.scrollBy(0, 200);
this.idleSince = null;
return;
}
var moreButtons = document.querySelectorAll(".PhotoGridMoreButton:not(.pgmbDisabled)");
if (moreButtons.length > 0) {
console.log("clicking load more button");
moreButtons[0].click();
this.idleSince = null;
return;
}
if (this.idleSince == null) {
console.log("nothing to do at the moment, might be waiting for something to load, setting this.idleSince=Date.now()");
this.idleSince = Date.now();
return;
} else {
var doneButtons = document.querySelectorAll(".PhotoGridMoreButton.pgmbDisabled");
if (Date.now() - this.idleSince > 9000 || (doneButtons.length > 0 && doneButtons[0].innerText === "All items loaded") ) {
console.log("finished loading-thumbs, it appears we have reached the bottom");
this.state = "clicking-first-thumb";
this.idleSince = null;
return;
} else {
// console.log("still might be waiting for something to load...");
return;
}
}
}
if (this.state === "clicking-first-thumb") {
var images = document.querySelectorAll("a.pgmiImageLink");
if (images && images !== "undefined") {
this.imageCount = images.length;
if (images.length > 0) {
console.log("clicking first thumbnail");
images[0].click();
this.idleSince = null;
this.state = "waiting-big-image";
return;
}
}
console.log("no big images to load?");
this.idleSince = Date.now();
if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) {
window.scrollBy(0, 200);
this.idleSince = null;
return;
}
if (this.state === "waiting-big-image") {
if(this.currentBigImage == null) {
var imageFrame = document.querySelectorAll("div.Modal div.Item div.iMedia div.Image");
if (imageFrame.length > 0 && imageFrame[0].getAttribute("src") !== this.previousBigImage ) {
this.currentBigImage = new Image();
this.currentBigImage.src = imageFrame[0].getAttribute("src");
//console.log("this.currentBigImage.naturalWidth=" + this.currentBigImage.naturalWidth + " this.currentBigImage.src=" + this.currentBigImage.src);
return;
} else if(this.idleSince == null ) {
console.log("waiting for image frame to load");
this.idleSince = Date.now();
return;
}
} else if (this.currentBigImage.src !== this.previousBigImage && this.currentBigImage.naturalWidth !== 0) {
console.log("next big image appears loaded, will click right arrow next time");
this.state = "click-next-big-image";
this.previousBigImage = this.currentBigImage.src;
this.currentBigImage = null;
this.bigImagesLoaded++;
this.idleSince = null;
if (this.bigImagesLoaded >= this.imageCount) {
console.log("looks like we're done, we've loaded all " + this.bigImagesLoaded + " of " + this.imageCount + " big images");
this.state = "finished";
this.idleSince = Date.now();
}
return;
} else if(this.idleSince == null) {
console.log("Waiting for big image to load");
this.idleSince = Date.now();
return;
}
var moreButtons = document.querySelectorAll("a._oidfu");
if (moreButtons.length > 0) {
console.log("clicking load more button");
moreButtons[0].click();
this.idleSince = null;
return;
}
if (this.state === "click-next-big-image") {
var rightArrow = document.querySelectorAll("a.mmRightArrow");
if (rightArrow.length > 0) {
// console.log("clicking right arrow");
rightArrow[0].click();
this.state = "waiting-big-image";
this.idleSince = null;
return;
} else {
console.warn("no right arrow to click?? weird");
this.idleSince = Date.now();
return;
}
if (this.idleSince == null) {
console.log("nothing to do at the moment, might be waiting for something to load, setting this.idleSince=Date.now()");
this.idleSince = Date.now();
return;
}
},

View File

@ -0,0 +1,136 @@
/*
* brozzler/behaviors.d/mouseovers.js.template - mouseovers behavior template,
* mouseovers on elements matching templatized css selector
*
* Copyright (C) 2014-2016 Internet Archive
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
 * Behavior object for the mouseovers template. Every 250ms it dispatches a
 * "mouseover" event on one not-yet-visited element matching the templatized
 * css selector (in the top document and in each reachable iframe document),
 * scrolling the window when nothing on screen is left to do. isFinished()
 * reports true once nothing has happened for IDLE_TIMEOUT_SEC seconds.
 */
var umbraBehavior = {
    // seconds without activity after which isFinished() returns true
    IDLE_TIMEOUT_SEC : 10,
    // Date.now() value idleness is measured from, or null
    idleSince : null,
    alreadyMouseovered : {},

    /*
     * One tick of the behavior: mouseover at most one target per document,
     * otherwise scroll toward targets that are off screen, otherwise scroll
     * down until the bottom of the page is reached.
     */
    intervalFunc : function() {
        var mouseoveredSomething = false;
        var somethingLeftBelow = false;
        var somethingLeftAbove = false;

        var cssSelector = "${mouseover_css_selector}";
        // handle Python to JavaScript boolean conversion: the parameter is
        // substituted in as text, and only the exact text "True" turns on
        // repeated mouseovering of elements that were already visited
        var mouseoverUntilTimeout = "${mouseover_until_hard_timeout}" === "True";

        var documents = this.reachableDocuments();

        for (var j = 0; j < documents.length; j++) {
            var mouseoverTargets = documents[j].querySelectorAll(cssSelector);
            for (var i = 0; i < mouseoverTargets.length; i++) {
                var target = mouseoverTargets[i];
                if (target.umbraMouseovered && !mouseoverUntilTimeout) {
                    continue;
                }

                var where = this.aboveBelowOrOnScreen(target);
                if (where == 0) {
                    console.log("mouseovering on " + target.outerHTML);
                    // do mouse over event on mouseover target
                    // since some urls are requested only on
                    // this event - see
                    // https://webarchive.jira.com/browse/AITFIVE-451
                    var mouseOverEvent = document.createEvent('Events');
                    mouseOverEvent.initEvent("mouseover", true, false);
                    target.dispatchEvent(mouseOverEvent);
                    mouseoveredSomething = true;
                    this.idleSince = null;
                    target.umbraMouseovered = true;
                    break; //break from mouseoverTargets loop, but not from iframe loop
                } else if (where > 0) {
                    somethingLeftBelow = true;
                } else if (where < 0) {
                    somethingLeftAbove = true;
                }
            }
        }

        if (!mouseoveredSomething) {
            if (somethingLeftAbove) {
                // everything on this screen has been mouseovered but we
                // missed something above
                window.scrollBy(0, -500);
                this.idleSince = null;
            } else if (somethingLeftBelow) {
                // everything on this screen has been mouseovered but
                // there's more below
                window.scrollBy(0, 200);
                this.idleSince = null;
            } else if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) {
                // no targets left, but we're not to the bottom yet
                window.scrollBy(0, 200);
                this.idleSince = null;
            } else if (this.idleSince == null) {
                this.idleSince = Date.now();
            }
        }

        // idleSince was cleared above whenever this tick did something;
        // restart the clock so isFinished() measures time since the last
        // mouseover or scroll
        if (!this.idleSince) {
            this.idleSince = Date.now();
        }
    },

    /*
     * Returns the top document followed by the document of every iframe
     * whose document can be read. Reading contentWindow.document of a
     * cross-origin frame throws; such frames are skipped so that one
     * foreign iframe does not abort every tick (which would also keep
     * idleSince from ever being set, so isFinished() could never be true).
     */
    reachableDocuments : function() {
        var documents = [document];
        var iframes = document.querySelectorAll("iframe");
        for (var i = 0; i < iframes.length; i++) {
            try {
                var frameDocument = iframes[i].contentWindow.document;
                if (frameDocument) {
                    documents.push(frameDocument);
                }
            } catch (e) {
                // frame is not accessible from this page, nothing to do
            }
        }
        return documents;
    },

    /* Starts calling intervalFunc every 250ms. */
    start : function() {
        var that = this;
        this.intervalId = setInterval(function() {
            that.intervalFunc()
        }, 250);
    },

    /*
     * Returns true, and stops the interval timer, once more than
     * IDLE_TIMEOUT_SEC seconds have passed since idleSince; false otherwise.
     */
    isFinished : function() {
        if (this.idleSince != null) {
            var idleTimeMs = Date.now() - this.idleSince;
            if (idleTimeMs / 1000 > this.IDLE_TIMEOUT_SEC) {
                clearInterval(this.intervalId);
                return true;
            }
        }
        return false;
    },

    /*
     * Returns -1 if the top edge of element e is above the visible part of
     * the window, 1 if it is below, 0 if it is on screen.
     *
     * getBoundingClientRect() is relative to the viewport, not to the
     * document, so the visible range is 0..window.innerHeight no matter how
     * far the page has been scrolled.
     * NOTE(review): for elements inside an iframe the rectangle is relative
     * to that frame's own viewport; the iframe's position within the top
     * window is not accounted for here.
     */
    aboveBelowOrOnScreen : function(e) {
        var eTop = e.getBoundingClientRect().top;
        if (eTop < 0) {
            return -1; // above
        } else if (eTop > window.innerHeight) {
            return 1; // below
        } else {
            return 0; // on screen
        }
    },
};
// Completion hook, called from outside of this script: reports whether the
// behavior has been idle long enough to count as finished.
var umbraBehaviorFinished = function() {
    var finished = umbraBehavior.isFinished();
    return finished;
};

// Begin the periodic mouseover/scroll ticks.
umbraBehavior.start();

View File

@ -104,6 +104,11 @@ behaviors:
url_regex: '^https?://(?:www\.)?fec.gov/data/.*$'
behavior_js: fec_gov.js
request_idle_timeout_sec: 10
- url_regex: '^https?://(?:www\.)?news\.com\.au/.*$'
behavior_js_template: mouseovers.js.template
default_parameters:
mouseover_css_selector: .menu-item a
request_idle_timeout_sec: 10
- # default fallback behavior
url_regex: '^.*$'
request_idle_timeout_sec: 10

View File

@ -1,5 +1,7 @@
id:
type: string
type:
- string
- integer
required: true
<<: &multi_level_options
@ -79,4 +81,4 @@ seeds:
type: url
required: true
<<: *multi_level_options
<<: *multi_level_options

View File

@ -54,11 +54,15 @@ class Url:
return self._host
def matches_ip_or_domain(self, ip_or_domain):
"""Returns true if
- ip_or_domain is an ip address and self.host is the same ip address
- ip_or_domain is a domain and self.host is the same domain
- ip_or_domain is a domain and self.host is a subdomain of it
"""
Returns true if
- ip_or_domain is an ip address and self.host is the same ip address
- ip_or_domain is a domain and self.host is the same domain
- ip_or_domain is a domain and self.host is a subdomain of it
"""
if not self.host:
return False
if ip_or_domain == self.host:
return True

View File

@ -53,7 +53,7 @@ class YoutubeDLSpy(urllib.request.BaseHandler):
self.reset()
def _http_response(self, request, response):
self.transactions.append(YoutubeDLSpy.Transaction(request,response))
self.transactions.append(YoutubeDLSpy.Transaction(request, response))
return response
http_response = https_response = _http_response
@ -175,11 +175,11 @@ class BrozzlerWorker:
try:
with urllib.request.urlopen(request) as response:
if response.status != 204:
if response.getcode() != 204:
self.logger.warn(
'got "%s %s" response on warcprox '
'WARCPROX_WRITE_RECORD request (expected 204)',
response.status, response.reason)
response.getcode(), response.reason)
except urllib.error.HTTPError as e:
self.logger.warn(
'got "%s %s" response on warcprox '
@ -197,7 +197,8 @@ class BrozzlerWorker:
"with youtube-dl json for %s", page)
self._warcprox_write_record(
warcprox_address=self._proxy(site),
url="youtube-dl:%s" % page.url, warc_type="metadata",
url="youtube-dl:%s" % brozzler.fixup(page.url),
warc_type="metadata",
content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
payload=info_json.encode("utf-8"),
extra_headers=site.extra_headers())
@ -237,12 +238,12 @@ class BrozzlerWorker:
screenshot_jpeg, thumbnail_jpeg = self.full_and_thumb_jpegs(
screenshot_png)
self._warcprox_write_record(warcprox_address=self._proxy(site),
url="screenshot:{}".format(page.url),
url="screenshot:%s" % brozzler.fixup(page.url),
warc_type="resource", content_type="image/jpeg",
payload=screenshot_jpeg,
extra_headers=site.extra_headers())
self._warcprox_write_record(warcprox_address=self._proxy(site),
url="thumbnail:{}".format(page.url),
url="thumbnail:%s" % brozzler.fixup(page.url),
warc_type="resource", content_type="image/jpeg",
payload=thumbnail_jpeg,
extra_headers=site.extra_headers())
@ -311,7 +312,7 @@ class BrozzlerWorker:
def _already_fetched(self, page, brozzler_spy):
for txn in brozzler_spy.final_bounces(page.url):
if (txn.request.get_method() == 'GET'
and txn.response.status == 200):
and txn.response.getcode() == 200):
return True
return False

View File

@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup(
name='brozzler',
version='1.1b7.dev105',
version='1.1b7.dev109',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',