mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-23 16:19:49 -05:00
Merge branch 'master' of github.com:internetarchive/umbra into ari-3774
This commit is contained in:
commit
e9451f88d8
@ -22,8 +22,7 @@ Install via pip from this repo, e.g.
|
||||
pip install git+https://github.com/internetarchive/umbra.git
|
||||
|
||||
Umbra requires an AMQP messaging service like RabbitMQ. On Ubuntu,
|
||||
`sudo apt-get install rabbitmq-server` will install and start RabbitMQ
|
||||
at amqp://guest:guest@localhost:5672/%2f, which the default AMQP url for umbra.
|
||||
`sudo apt-get install rabbitmq-server` will install and start RabbitMQ at amqp://guest:guest@localhost:5672/%2f, which is the default AMQP url for umbra.
|
||||
|
||||
Run
|
||||
---
|
||||
|
@ -15,7 +15,7 @@ arg_parser.add_argument('-u', '--url', dest='amqp_url', default='amqp://guest:gu
|
||||
help='URL identifying the AMQP server to talk to')
|
||||
arg_parser.add_argument('--exchange', dest='amqp_exchange', default='umbra',
|
||||
help='AMQP exchange name')
|
||||
arg_parser.add_argument('--routing-key', dest='amqp_routing_key', default='url',
|
||||
arg_parser.add_argument('--routing-key', dest='amqp_routing_key', default='urls',
|
||||
help='AMQP routing key')
|
||||
arg_parser.add_argument('-i', '--client-id', dest='client_id', default='load_url.0',
|
||||
help='client id - included in the json payload with each url; umbra uses this value as the routing key to send requests back to')
|
||||
|
@ -8,18 +8,81 @@ var umbraAboveBelowOrOnScreen = function(e) {
|
||||
if (eTop < window.scrollY) {
|
||||
return -1; // above
|
||||
} else if (eTop > window.scrollY + window.innerHeight) {
|
||||
// if (e.clientWidth != 0) {
|
||||
// console.warn("e.clientWidth=" + e.clientWidth + " though it appears to be below the screen? e.getBoundingClientRect().top=" + eTop + " window.scrollY=" + window.scrollY + " window.innerHeight=" + window.innerHeight + " e=" + e);
|
||||
// }
|
||||
return 1; // below
|
||||
} else {
|
||||
// if (e.clientWidth != 0) {
|
||||
// console.warn("e.clientWidth=" + e.clientWidth + " though it appears to be on screen? e.getBoundingClientRect().top=" + eTop + " window.scrollY=" + window.scrollY + " window.innerHeight=" + window.innerHeight + " e=" + e);
|
||||
// }
|
||||
return 0; // on screen
|
||||
}
|
||||
}
|
||||
|
||||
// comments - 'a.UFIPagerLink > span, a.UFIPagerLink, span.UFIReplySocialSentenceLinkText'
|
||||
var UMBRA_THINGS_TO_CLICK_SELECTOR = 'a[href^="/browse/likes"], *[rel="theater"]';
|
||||
//div[class="phm pluginLikeboxStream"] = facebook widget embedded in 3rd party pages
|
||||
var UMBRA_THINGS_TO_SCROLL_SELECTOR = 'div[class="phm pluginLikeboxStream"]';
|
||||
var NUMBER_FAILED_SCROLL_ATTEMPTS_ON_THING_TO_SCROLL_BEFORE_STOP_SCROLLING = 5;
|
||||
var umbraAlreadyClicked = {};
|
||||
var umbraState = {'idleSince':null,'expectingSomething':null};
|
||||
var umbraAlreadyScrolledThing = {};
|
||||
var umbraScrolledThingFailedScrollAttempts = {};
|
||||
var umbraState = {'idleSince':null,'expectingSomething':null,'bottomReachedScrollY':0};
|
||||
|
||||
var umbraIntervalFunc = function() {
|
||||
|
||||
var thingsToScroll = document.querySelectorAll(UMBRA_THINGS_TO_SCROLL_SELECTOR);
|
||||
var everythingScrolled = true;
|
||||
|
||||
for (var i = 0; i < thingsToScroll.length; i++) {
|
||||
var target = thingsToScroll[i];
|
||||
|
||||
if (!(target in umbraAlreadyScrolledThing)) {
|
||||
|
||||
everythingScrolled = false;
|
||||
|
||||
console.log("scrolling to " + target.scrollHeight + " on element with nodeName " + target.nodeName + " with id of " + target.id);
|
||||
var lastScrollTop = target.scrollTop;
|
||||
target.scrollTop = target.scrollHeight;
|
||||
|
||||
umbraState.idleSince = null;
|
||||
|
||||
if (target.scrollTop >= target.scrollHeight) {
|
||||
umbraAlreadyScrolledThing[target] = true;
|
||||
}
|
||||
else if (target.scrollTop == lastScrollTop) {
|
||||
if (umbraScrolledThingFailedScrollAttempts[target]) {
|
||||
umbraScrolledThingFailedScrollAttempts[target]++;
|
||||
}
|
||||
else {
|
||||
umbraScrolledThingFailedScrollAttempts[target] = 1;
|
||||
}
|
||||
|
||||
if (umbraScrolledThingFailedScrollAttempts[target] >= NUMBER_FAILED_SCROLL_ATTEMPTS_ON_THING_TO_SCROLL_BEFORE_STOP_SCROLLING) {
|
||||
umbraAlreadyScrolledThing[target] = true;
|
||||
}
|
||||
}
|
||||
else {
|
||||
//reset failed count on a successful scroll
|
||||
umbraScrolledThingFailedScrollAttempts[target] = 0;
|
||||
}
|
||||
}
|
||||
else {
|
||||
console.log("done scrolling for element with nodeName " + target.nodeName + " with id of " + target.id)
|
||||
}
|
||||
|
||||
umbraState.expectingSomething = null;
|
||||
}
|
||||
|
||||
if (thingsToScroll && thingsToScroll.length > 0 && everythingScrolled) {
|
||||
if (umbraState.idleSince == null) {
|
||||
umbraState.idleSince = Date.now();
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
var closeButtons = document.querySelectorAll('a[title="Close"], a.closeTheater');
|
||||
for (var i = 0; i < closeButtons.length; i++) {
|
||||
// XXX closeTheater buttons stick around in the dom after closing, clientWidth>0 is one way to check if they're visible
|
||||
@ -69,19 +132,23 @@ var umbraIntervalFunc = function() {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (window.scrollY > umbraState.bottomReachedScrollY) {
|
||||
umbraState.bottomReachedScrollY = window.scrollY;
|
||||
}
|
||||
|
||||
if (!clickedSomething) {
|
||||
if (somethingLeftAbove) {
|
||||
console.log("scrolling UP because everything on this screen has been clicked but we missed something above");
|
||||
window.scrollBy(0, -500);
|
||||
if (somethingLeftBelow) {
|
||||
// console.log("scrolling down because everything on this screen has been clicked but there's more below document.body.clientHeight=" + document.body.clientHeight);
|
||||
window.scrollBy(0, 300);
|
||||
umbraState.idleSince = null;
|
||||
} else if (somethingLeftBelow) {
|
||||
console.log("scrolling because everything on this screen has been clicked but there's more below document.body.clientHeight=" + document.body.clientHeight);
|
||||
window.scrollBy(0, 200);
|
||||
} else if (umbraState.bottomReachedScrollY + window.innerHeight < document.documentElement.scrollHeight) {
|
||||
// console.log("scrolling down because we haven't reached the bottom yet document.body.clientHeight=" + document.body.clientHeight);
|
||||
window.scrollBy(0, 300);
|
||||
umbraState.idleSince = null;
|
||||
} else if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) {
|
||||
console.log("scrolling because we're not to the bottom yet document.body.clientHeight=" + document.body.clientHeight);
|
||||
window.scrollBy(0, 200);
|
||||
} else if (somethingLeftAbove) {
|
||||
// console.log("scrolling UP because we've already been to the bottom, everything on or below this screen has been clicked, but we missed something above");
|
||||
window.scrollBy(0, -600);
|
||||
umbraState.idleSince = null;
|
||||
} else if (umbraState.idleSince == null) {
|
||||
umbraState.idleSince = Date.now();
|
||||
@ -95,6 +162,7 @@ var UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC = 10;
|
||||
|
||||
// Called from outside of this script.
|
||||
var umbraBehaviorFinished = function() {
|
||||
|
||||
if (umbraState.idleSince != null) {
|
||||
var idleTimeMs = Date.now() - umbraState.idleSince;
|
||||
if (idleTimeMs / 1000 > UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC) {
|
||||
|
56
umbra/behaviors.d/marquette_edu.js
Normal file
56
umbra/behaviors.d/marquette_edu.js
Normal file
@ -0,0 +1,56 @@
|
||||
// {"url_regex":"^https?://(?:www\\.)?marquette\\.edu/.*$", "request_idle_timeout_sec":10}
|
||||
//
|
||||
// vim:set sw=8 et:
|
||||
//
|
||||
|
||||
// Shared behavior state. idleSince is null while the behavior is still
// scrolling/clicking, otherwise the Date.now() timestamp at which it ran out
// of things to do.
var umbraState = {'idleSince': null};

// Records which video links have already been clicked.
var umbraAlreadyClicked = {};

// Drive the behavior: run umbraScrollInterval every 50 ms.
var umbraIntervalID = setInterval(umbraScrollInterval, 50);
|
||||
// Timer callback (scheduled with setInterval). While the window is not yet at
// the bottom of the document it scrolls down one step; once at the bottom it
// clicks every "div#vid_box a" link that has not been clicked before.
// umbraState.idleSince is reset to null whenever something was scrolled or
// clicked, and set to the current time once there is nothing left to do.
function umbraScrollInterval() {
        // Not at the bottom yet: keep scrolling and report activity.
        if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) {
                umbraScroll();
                umbraState.idleSince = null;
                return;
        }

        var videoBoxes = document.querySelectorAll("div#vid_box a");
        var clickedVideo = false;

        // The counter is declared locally; the previous undeclared `i` leaked
        // into the global scope and is a ReferenceError in strict mode.
        for (var i = 0; i < videoBoxes.length; i++) {
                var videoBox = videoBoxes[i];

                // NOTE(review): an element used as an object key is coerced to
                // a string (an anchor stringifies to its href), so links that
                // stringify identically are clicked only once - confirm that
                // this de-duplication is intended.
                if (!(videoBox in umbraAlreadyClicked)) {
                        videoBox.click();
                        umbraState.idleSince = null;
                        umbraAlreadyClicked[videoBox] = true;
                        clickedVideo = true;
                }
        }

        // Nothing new to click: start the idle clock unless it is already running.
        if (!clickedVideo && umbraState.idleSince == null) {
                umbraState.idleSince = Date.now();
        }
}
|
||||
|
||||
// Scroll the window down by one fixed step (no horizontal movement).
function umbraScroll() {
        var horizontalStep = 0;
        var verticalStep = 50;
        window.scrollBy(horizontalStep, verticalStep);
}
|
||||
|
||||
|
||||
// If we haven't had anything to do (scrolled, clicked, etc) in this amount of
|
||||
// time, then we consider ourselves finished with the page.
|
||||
|
||||
var UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC = 10;
|
||||
|
||||
// Called from outside of this script.
|
||||
// Returns true once umbraState.idleSince is set and more than
// UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC seconds have passed since then;
// returns false otherwise.
var umbraBehaviorFinished = function() {
        var idleStart = umbraState.idleSince;
        if (idleStart == null) {
                return false;
        }

        var idleSeconds = (Date.now() - idleStart) / 1000;
        return idleSeconds > UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC;
};
|
||||
|
@ -68,7 +68,8 @@ class Behavior:
|
||||
self.notify_of_activity()
|
||||
|
||||
def is_finished(self):
|
||||
msg_id = self.umbra_worker.send_to_chrome(method="Runtime.evaluate", params={"expression": "umbraBehaviorFinished()"})
|
||||
msg_id = self.umbra_worker.send_to_chrome(method="Runtime.evaluate",
|
||||
suppress_logging=True, params={"expression":"umbraBehaviorFinished()"})
|
||||
self.waiting_result_msg_ids.append(msg_id)
|
||||
|
||||
request_idle_timeout_sec = 30
|
||||
|
@ -144,11 +144,12 @@ class Browser:
|
||||
|
||||
self._behavior = None
|
||||
|
||||
def send_to_chrome(self, **kwargs):
|
||||
def send_to_chrome(self, suppress_logging=False, **kwargs):
|
||||
msg_id = next(self.command_id)
|
||||
kwargs['id'] = msg_id
|
||||
msg = json.dumps(kwargs)
|
||||
self.logger.debug('sending message to {}: {}'.format(self._websock, msg))
|
||||
if not suppress_logging:
|
||||
self.logger.debug('sending message to {}: {}'.format(self._websock, msg))
|
||||
self._websock.send(msg)
|
||||
return msg_id
|
||||
|
||||
|
@ -117,6 +117,9 @@ class AmqpBrowserController:
|
||||
break # out of "while True" to acquire another browser
|
||||
except socket.timeout:
|
||||
pass
|
||||
except socket.error:
|
||||
self.logger.error("problem consuming messages from AMQP, will try reconnecting after active browsing finishes", exc_info=True)
|
||||
self._reconnect_requested = True
|
||||
|
||||
if self._consumer_stop.is_set() or time.time() - start >= timeout or self._reconnect_requested:
|
||||
browser.stop()
|
||||
|
Loading…
x
Reference in New Issue
Block a user