mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-20 23:56:34 -04:00
Merge branch 'instagram' into qa
This commit is contained in:
commit
65a46f4558
10
README.rst
10
README.rst
@ -157,7 +157,7 @@ Next install the build tools and fetch the source code:
|
||||
mkdir -p ~/chromium
|
||||
cd ~/chromium
|
||||
git clone https://chromium.googlesource.com/chromium/tools/depot_tools.git
|
||||
export $PATH=$PWD/depot_tools:$PATH
|
||||
export PATH=$PWD/depot_tools:$PATH
|
||||
fetch --no-history chromium --nosvn=True
|
||||
|
||||
Configure a headless release build (the debug builds are much larger):
|
||||
@ -195,13 +195,13 @@ option:
|
||||
chmod +x ~/bin/headless_chromium.sh
|
||||
brozzler-worker --chrome-exe ~/bin/headless_chromium.sh
|
||||
|
||||
The Pepper Flash plugin ``libpepflashplayer.so`` from an official Google Chrome
|
||||
release may be used with Headless Chromium by adding this option to the wrapper
|
||||
script:
|
||||
To render Flash content, `download <https://get.adobe.com/flashplayer/otherversions/>`_
|
||||
and extract the Linux (.tar.gz) PPAPI plugin. Configure Headless Chromium
|
||||
to load the plugin by adding this option to your wrapper script:
|
||||
|
||||
::
|
||||
|
||||
--register-pepper-plugins=/opt/google/chrome/PepperFlash/libpepflashplayer.so;application/x-shockwave-flash
|
||||
--register-pepper-plugins="/opt/PepperFlash/libpepflashplayer.so;application/x-shockwave-flash"
|
||||
|
||||
License
|
||||
-------
|
||||
|
@ -19,6 +19,7 @@ limitations under the License.
|
||||
|
||||
import json as _json
|
||||
import logging as _logging
|
||||
import surt as _surt
|
||||
from pkg_resources import get_distribution as _get_distribution
|
||||
|
||||
__version__ = _get_distribution('brozzler').version
|
||||
@ -64,6 +65,16 @@ class BaseDictable:
|
||||
def __repr__(self):
|
||||
return "{}(**{})".format(self.__class__.__name__, self.to_dict())
|
||||
|
||||
def fixup(url):
    '''
    Returns `url` after rudimentary canonicalization.

    The url is parsed with surt's handyurl; if the parsed url has a host,
    the host is converted to its IDNA (punycode) ascii form and lowercased.
    The result is serialized back to a string with `getURLString()`.
    '''
    parsed = _surt.handyurl.parse(url)
    # handyurl.parse() already lowercases the scheme via urlsplit
    if not parsed.host:
        # nothing to canonicalize without a host
        return parsed.getURLString()
    ascii_host = parsed.host.encode('idna').decode('ascii')
    parsed.host = ascii_host.lower()
    return parsed.getURLString()
|
||||
|
||||
# logging level more fine-grained than logging.DEBUG==10
|
||||
TRACE = 5
|
||||
|
||||
|
@ -16,115 +16,30 @@
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
|
||||
var umbraInstagramBehavior = {
|
||||
IDLE_TIMEOUT_SEC: 20,
|
||||
idleSince: null,
|
||||
state: "loading-thumbs",
|
||||
imageCount: null,
|
||||
bigImagesLoaded: 0,
|
||||
currentBigImage: null,
|
||||
previousBigImage: null,
|
||||
|
||||
intervalFunc: function() {
|
||||
if (this.state === "loading-thumbs") {
|
||||
if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) {
|
||||
window.scrollBy(0, 200);
|
||||
this.idleSince = null;
|
||||
return;
|
||||
}
|
||||
|
||||
var moreButtons = document.querySelectorAll(".PhotoGridMoreButton:not(.pgmbDisabled)");
|
||||
if (moreButtons.length > 0) {
|
||||
console.log("clicking load more button");
|
||||
moreButtons[0].click();
|
||||
this.idleSince = null;
|
||||
return;
|
||||
}
|
||||
|
||||
if (this.idleSince == null) {
|
||||
console.log("nothing to do at the moment, might be waiting for something to load, setting this.idleSince=Date.now()");
|
||||
this.idleSince = Date.now();
|
||||
return;
|
||||
} else {
|
||||
var doneButtons = document.querySelectorAll(".PhotoGridMoreButton.pgmbDisabled");
|
||||
if (Date.now() - this.idleSince > 9000 || (doneButtons.length > 0 && doneButtons[0].innerText === "All items loaded") ) {
|
||||
console.log("finished loading-thumbs, it appears we have reached the bottom");
|
||||
this.state = "clicking-first-thumb";
|
||||
this.idleSince = null;
|
||||
return;
|
||||
} else {
|
||||
// console.log("still might be waiting for something to load...");
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (this.state === "clicking-first-thumb") {
|
||||
var images = document.querySelectorAll("a.pgmiImageLink");
|
||||
if (images && images !== "undefined") {
|
||||
this.imageCount = images.length;
|
||||
if (images.length > 0) {
|
||||
console.log("clicking first thumbnail");
|
||||
images[0].click();
|
||||
this.idleSince = null;
|
||||
this.state = "waiting-big-image";
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
console.log("no big images to load?");
|
||||
this.idleSince = Date.now();
|
||||
if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) {
|
||||
window.scrollBy(0, 200);
|
||||
this.idleSince = null;
|
||||
return;
|
||||
}
|
||||
|
||||
if (this.state === "waiting-big-image") {
|
||||
if(this.currentBigImage == null) {
|
||||
var imageFrame = document.querySelectorAll("div.Modal div.Item div.iMedia div.Image");
|
||||
if (imageFrame.length > 0 && imageFrame[0].getAttribute("src") !== this.previousBigImage ) {
|
||||
this.currentBigImage = new Image();
|
||||
this.currentBigImage.src = imageFrame[0].getAttribute("src");
|
||||
//console.log("this.currentBigImage.naturalWidth=" + this.currentBigImage.naturalWidth + " this.currentBigImage.src=" + this.currentBigImage.src);
|
||||
return;
|
||||
} else if(this.idleSince == null ) {
|
||||
console.log("waiting for image frame to load");
|
||||
this.idleSince = Date.now();
|
||||
return;
|
||||
}
|
||||
} else if (this.currentBigImage.src !== this.previousBigImage && this.currentBigImage.naturalWidth !== 0) {
|
||||
console.log("next big image appears loaded, will click right arrow next time");
|
||||
this.state = "click-next-big-image";
|
||||
this.previousBigImage = this.currentBigImage.src;
|
||||
this.currentBigImage = null;
|
||||
this.bigImagesLoaded++;
|
||||
this.idleSince = null;
|
||||
|
||||
if (this.bigImagesLoaded >= this.imageCount) {
|
||||
console.log("looks like we're done, we've loaded all " + this.bigImagesLoaded + " of " + this.imageCount + " big images");
|
||||
this.state = "finished";
|
||||
this.idleSince = Date.now();
|
||||
}
|
||||
return;
|
||||
} else if(this.idleSince == null) {
|
||||
console.log("Waiting for big image to load");
|
||||
this.idleSince = Date.now();
|
||||
return;
|
||||
}
|
||||
|
||||
var moreButtons = document.querySelectorAll("a._oidfu");
|
||||
if (moreButtons.length > 0) {
|
||||
console.log("clicking load more button");
|
||||
moreButtons[0].click();
|
||||
this.idleSince = null;
|
||||
return;
|
||||
}
|
||||
|
||||
if (this.state === "click-next-big-image") {
|
||||
var rightArrow = document.querySelectorAll("a.mmRightArrow");
|
||||
if (rightArrow.length > 0) {
|
||||
// console.log("clicking right arrow");
|
||||
rightArrow[0].click();
|
||||
this.state = "waiting-big-image";
|
||||
this.idleSince = null;
|
||||
return;
|
||||
} else {
|
||||
console.warn("no right arrow to click?? weird");
|
||||
this.idleSince = Date.now();
|
||||
return;
|
||||
}
|
||||
if (this.idleSince == null) {
|
||||
console.log("nothing to do at the moment, might be waiting for something to load, setting this.idleSince=Date.now()");
|
||||
this.idleSince = Date.now();
|
||||
return;
|
||||
}
|
||||
},
|
||||
|
||||
|
136
brozzler/behaviors.d/mouseovers.js.template
Normal file
136
brozzler/behaviors.d/mouseovers.js.template
Normal file
@ -0,0 +1,136 @@
|
||||
/*
|
||||
* brozzler/behaviors.d/mouseovers.js.template - mouseovers behavior template,
|
||||
* mouseovers on elements matching templatized css selector
|
||||
*
|
||||
* Copyright (C) 2014-2016 Internet Archive
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/*
 * Behavior object that fires synthetic "mouseover" events on elements
 * matching a templatized css selector, in the top-level document and in
 * every iframe document it is able to reach, scrolling the window as
 * needed. It considers itself finished once it has been idle for longer
 * than IDLE_TIMEOUT_SEC.
 */
var umbraBehavior = {
    // Seconds of inactivity after which isFinished() returns true.
    IDLE_TIMEOUT_SEC : 10,
    // Date.now() timestamp of when we last became idle, or null.
    idleSince : null,
    alreadyMouseovered : {},

    /*
     * One tick of the behavior: mouseover at most one not-yet-handled
     * on-screen target per document, otherwise scroll towards remaining
     * targets or towards the bottom of the page.
     */
    intervalFunc : function() {
        var mouseoveredSomething = false;
        var somethingLeftBelow = false;
        var somethingLeftAbove = false;
        var cssSelector = "${mouseover_css_selector}";

        // Handle Python to JavaScript boolean conversion: the template
        // parameter arrives as text, and only the literal "True" enables
        // repeated mouseovering of targets that were already handled.
        var mouseoverUntilTimeout = ("${mouseover_until_hard_timeout}" == "True");

        var documents = this._accessibleDocuments();

        for (var d = 0; d < documents.length; d++) {
            var mouseoverTargets = documents[d].querySelectorAll(cssSelector);

            for (var t = 0; t < mouseoverTargets.length; t++) {
                var target = mouseoverTargets[t];
                if (target.umbraMouseovered && !mouseoverUntilTimeout) {
                    continue;
                }

                var where = this.aboveBelowOrOnScreen(target);

                if (where == 0) {
                    console.log("mouseovering on " + target.outerHTML);
                    // do mouse over event on mouseover target
                    // since some urls are requested only on
                    // this event - see
                    // https://webarchive.jira.com/browse/AITFIVE-451
                    var mouseOverEvent = document.createEvent('Events');
                    mouseOverEvent.initEvent("mouseover", true, false);
                    target.dispatchEvent(mouseOverEvent);
                    mouseoveredSomething = true;
                    this.idleSince = null;
                    target.umbraMouseovered = true;

                    break; // break from targets loop, but not from documents loop
                } else if (where > 0) {
                    somethingLeftBelow = true;
                } else if (where < 0) {
                    somethingLeftAbove = true;
                }
            }
        }

        if (!mouseoveredSomething) {
            if (somethingLeftAbove) {
                // everything on this screen has been mouseovered but we
                // missed something above
                window.scrollBy(0, -500);
                this.idleSince = null;
            } else if (somethingLeftBelow) {
                // everything on this screen has been mouseovered but
                // there is more below
                window.scrollBy(0, 200);
                this.idleSince = null;
            } else if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) {
                // not at the bottom of the page yet
                window.scrollBy(0, 200);
                this.idleSince = null;
            } else if (this.idleSince == null) {
                this.idleSince = Date.now();
            }
        }

        // Any activity above cleared idleSince; restart the idle clock so
        // that the timeout is measured from the most recent activity.
        if (!this.idleSince) {
            this.idleSince = Date.now();
        }
    },

    /*
     * Returns an array holding the top-level document followed by the
     * document of every iframe we are allowed to access. Reading
     * contentWindow.document can throw (for example for a cross-origin
     * frame) or contentWindow can be null; such frames are skipped so one
     * inaccessible iframe does not abort the whole tick.
     */
    _accessibleDocuments : function() {
        var docs = [document];
        var iframes = document.querySelectorAll("iframe");
        for (var i = 0; i < iframes.length; i++) {
            try {
                var frameDocument = iframes[i].contentWindow.document;
                if (frameDocument) {
                    docs.push(frameDocument);
                }
            } catch (err) {
                // inaccessible iframe, nothing we can do with it
            }
        }
        return docs;
    },

    /*
     * Starts calling intervalFunc() every 250 ms and remembers the
     * interval id so isFinished() can stop it.
     */
    start : function() {
        var that = this;
        this.intervalId = setInterval(function() {
            that.intervalFunc();
        }, 250);
    },

    /*
     * Returns true, and stops the interval, once we have been idle for
     * more than IDLE_TIMEOUT_SEC seconds; otherwise returns false.
     */
    isFinished : function() {
        if (this.idleSince != null) {
            var idleTimeMs = Date.now() - this.idleSince;
            if (idleTimeMs / 1000 > this.IDLE_TIMEOUT_SEC) {
                clearInterval(this.intervalId);
                return true;
            }
        }
        return false;
    },

    /*
     * Returns -1 if the top edge of element e is above the visible part
     * of the window, 1 if it is below, 0 if it is on screen.
     *
     * getBoundingClientRect() reports coordinates relative to the
     * viewport, so the top edge is compared with 0 and window.innerHeight
     * rather than with the document scroll offset.
     * NOTE(review): for elements inside an iframe the rectangle is
     * relative to that iframe's own viewport; this still measures against
     * the top-level window - confirm whether that is good enough.
     */
    aboveBelowOrOnScreen : function(e) {
        var eTop = e.getBoundingClientRect().top;
        if (eTop < 0) {
            return -1; // above
        } else if (eTop > window.innerHeight) {
            return 1; // below
        } else {
            return 0; // on screen
        }
    },
};
|
||||
|
||||
// Called from outside of this script; reports whether the mouseover
// behavior has finished, as decided by umbraBehavior.isFinished().
function umbraBehaviorFinished() {
    return umbraBehavior.isFinished();
}

// Begin running the behavior as soon as this script is evaluated.
umbraBehavior.start();
|
@ -104,6 +104,11 @@ behaviors:
|
||||
url_regex: '^https?://(?:www\.)?fec.gov/data/.*$'
|
||||
behavior_js: fec_gov.js
|
||||
request_idle_timeout_sec: 10
|
||||
- url_regex: '^https?://(?:www\.)?news\.com\.au/.*$'
|
||||
behavior_js_template: mouseovers.js.template
|
||||
default_parameters:
|
||||
mouseover_css_selector: .menu-item a
|
||||
request_idle_timeout_sec: 10
|
||||
- # default fallback behavior
|
||||
url_regex: '^.*$'
|
||||
request_idle_timeout_sec: 10
|
||||
|
@ -1,5 +1,7 @@
|
||||
id:
|
||||
type: string
|
||||
type:
|
||||
- string
|
||||
- integer
|
||||
required: true
|
||||
|
||||
<<: &multi_level_options
|
||||
@ -79,4 +81,4 @@ seeds:
|
||||
type: url
|
||||
required: true
|
||||
|
||||
<<: *multi_level_options
|
||||
<<: *multi_level_options
|
||||
|
@ -54,11 +54,15 @@ class Url:
|
||||
return self._host
|
||||
|
||||
def matches_ip_or_domain(self, ip_or_domain):
|
||||
"""Returns true if
|
||||
- ip_or_domain is an ip address and self.host is the same ip address
|
||||
- ip_or_domain is a domain and self.host is the same domain
|
||||
- ip_or_domain is a domain and self.host is a subdomain of it
|
||||
"""
|
||||
Returns true if
|
||||
- ip_or_domain is an ip address and self.host is the same ip address
|
||||
- ip_or_domain is a domain and self.host is the same domain
|
||||
- ip_or_domain is a domain and self.host is a subdomain of it
|
||||
"""
|
||||
if not self.host:
|
||||
return False
|
||||
|
||||
if ip_or_domain == self.host:
|
||||
return True
|
||||
|
||||
|
@ -53,7 +53,7 @@ class YoutubeDLSpy(urllib.request.BaseHandler):
|
||||
self.reset()
|
||||
|
||||
def _http_response(self, request, response):
|
||||
self.transactions.append(YoutubeDLSpy.Transaction(request,response))
|
||||
self.transactions.append(YoutubeDLSpy.Transaction(request, response))
|
||||
return response
|
||||
|
||||
http_response = https_response = _http_response
|
||||
@ -175,11 +175,11 @@ class BrozzlerWorker:
|
||||
|
||||
try:
|
||||
with urllib.request.urlopen(request) as response:
|
||||
if response.status != 204:
|
||||
if response.getcode() != 204:
|
||||
self.logger.warn(
|
||||
'got "%s %s" response on warcprox '
|
||||
'WARCPROX_WRITE_RECORD request (expected 204)',
|
||||
response.status, response.reason)
|
||||
response.getcode(), response.reason)
|
||||
except urllib.error.HTTPError as e:
|
||||
self.logger.warn(
|
||||
'got "%s %s" response on warcprox '
|
||||
@ -197,7 +197,8 @@ class BrozzlerWorker:
|
||||
"with youtube-dl json for %s", page)
|
||||
self._warcprox_write_record(
|
||||
warcprox_address=self._proxy(site),
|
||||
url="youtube-dl:%s" % page.url, warc_type="metadata",
|
||||
url="youtube-dl:%s" % brozzler.fixup(page.url),
|
||||
warc_type="metadata",
|
||||
content_type="application/vnd.youtube-dl_formats+json;charset=utf-8",
|
||||
payload=info_json.encode("utf-8"),
|
||||
extra_headers=site.extra_headers())
|
||||
@ -237,12 +238,12 @@ class BrozzlerWorker:
|
||||
screenshot_jpeg, thumbnail_jpeg = self.full_and_thumb_jpegs(
|
||||
screenshot_png)
|
||||
self._warcprox_write_record(warcprox_address=self._proxy(site),
|
||||
url="screenshot:{}".format(page.url),
|
||||
url="screenshot:%s" % brozzler.fixup(page.url),
|
||||
warc_type="resource", content_type="image/jpeg",
|
||||
payload=screenshot_jpeg,
|
||||
extra_headers=site.extra_headers())
|
||||
self._warcprox_write_record(warcprox_address=self._proxy(site),
|
||||
url="thumbnail:{}".format(page.url),
|
||||
url="thumbnail:%s" % brozzler.fixup(page.url),
|
||||
warc_type="resource", content_type="image/jpeg",
|
||||
payload=thumbnail_jpeg,
|
||||
extra_headers=site.extra_headers())
|
||||
@ -311,7 +312,7 @@ class BrozzlerWorker:
|
||||
def _already_fetched(self, page, brozzler_spy):
|
||||
for txn in brozzler_spy.final_bounces(page.url):
|
||||
if (txn.request.get_method() == 'GET'
|
||||
and txn.response.status == 200):
|
||||
and txn.response.getcode() == 200):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user