Merge branch 'master' of github.com:internetarchive/umbra into ari-3774

This commit is contained in:
Hunter Stern 2015-01-21 16:21:13 -08:00
commit e9451f88d8
7 changed files with 144 additions and 16 deletions

View File

@ -22,8 +22,7 @@ Install via pip from this repo, e.g.
pip install git+https://github.com/internetarchive/umbra.git
Umbra requires an AMQP messaging service like RabbitMQ. On Ubuntu,
`sudo apt-get install rabbitmq-server` will install and start RabbitMQ
at amqp://guest:guest@localhost:5672/%2f, which the default AMQP url for umbra.
`sudo apt-get install rabbitmq-server` will install and start RabbitMQ at amqp://guest:guest@localhost:5672/%2f, which is the default AMQP url for umbra.
Run
---

View File

@ -15,7 +15,7 @@ arg_parser.add_argument('-u', '--url', dest='amqp_url', default='amqp://guest:gu
help='URL identifying the AMQP server to talk to')
arg_parser.add_argument('--exchange', dest='amqp_exchange', default='umbra',
help='AMQP exchange name')
arg_parser.add_argument('--routing-key', dest='amqp_routing_key', default='url',
arg_parser.add_argument('--routing-key', dest='amqp_routing_key', default='urls',
help='AMQP routing key')
arg_parser.add_argument('-i', '--client-id', dest='client_id', default='load_url.0',
help='client id - included in the json payload with each url; umbra uses this value as the routing key to send requests back to')

View File

@ -8,18 +8,81 @@ var umbraAboveBelowOrOnScreen = function(e) {
if (eTop < window.scrollY) {
return -1; // above
} else if (eTop > window.scrollY + window.innerHeight) {
// if (e.clientWidth != 0) {
// console.warn("e.clientWidth=" + e.clientWidth + " though it appears to be below the screen? e.getBoundingClientRect().top=" + eTop + " window.scrollY=" + window.scrollY + " window.innerHeight=" + window.innerHeight + " e=" + e);
// }
return 1; // below
} else {
// if (e.clientWidth != 0) {
// console.warn("e.clientWidth=" + e.clientWidth + " though it appears to be on screen? e.getBoundingClientRect().top=" + eTop + " window.scrollY=" + window.scrollY + " window.innerHeight=" + window.innerHeight + " e=" + e);
// }
return 0; // on screen
}
}
// comments - 'a.UFIPagerLink > span, a.UFIPagerLink, span.UFIReplySocialSentenceLinkText'
var UMBRA_THINGS_TO_CLICK_SELECTOR = 'a[href^="/browse/likes"], *[rel="theater"]';
//div[class="phm pluginLikeboxStream"] = facebook widget embedded in 3rd party pages
var UMBRA_THINGS_TO_SCROLL_SELECTOR = 'div[class="phm pluginLikeboxStream"]';
var NUMBER_FAILED_SCROLL_ATTEMPTS_ON_THING_TO_SCROLL_BEFORE_STOP_SCROLLING = 5;
var umbraAlreadyClicked = {};
var umbraState = {'idleSince':null,'expectingSomething':null};
var umbraAlreadyScrolledThing = {};
var umbraScrolledThingFailedScrollAttempts = {};
var umbraState = {'idleSince':null,'expectingSomething':null,'bottomReachedScrollY':0};
var umbraIntervalFunc = function() {
var thingsToScroll = document.querySelectorAll(UMBRA_THINGS_TO_SCROLL_SELECTOR);
var everythingScrolled = true;
for (var i = 0; i < thingsToScroll.length; i++) {
var target = thingsToScroll[i];
if (!(target in umbraAlreadyScrolledThing)) {
everythingScrolled = false;
console.log("scrolling to " + target.scrollHeight + " on element with nodeName " + target.nodeName + " with id of " + target.id);
var lastScrollTop = target.scrollTop;
target.scrollTop = target.scrollHeight;
umbraState.idleSince = null;
if (target.scrollTop >= target.scrollHeight) {
umbraAlreadyScrolledThing[target] = true;
}
else if (target.scrollTop == lastScrollTop) {
if (umbraScrolledThingFailedScrollAttempts[target]) {
umbraScrolledThingFailedScrollAttempts[target]++;
}
else {
umbraScrolledThingFailedScrollAttempts[target] = 1;
}
if (umbraScrolledThingFailedScrollAttempts[target] >= NUMBER_FAILED_SCROLL_ATTEMPTS_ON_THING_TO_SCROLL_BEFORE_STOP_SCROLLING) {
umbraAlreadyScrolledThing[target] = true;
}
}
else {
//reset failed count on a successful scroll
umbraScrolledThingFailedScrollAttempts[target] = 0;
}
}
else {
console.log("done scrolling for element with nodeName " + target.nodeName + " with id of " + target.id)
}
umbraState.expectingSomething = null;
}
if (thingsToScroll && thingsToScroll.length > 0 && everythingScrolled) {
if (umbraState.idleSince == null) {
umbraState.idleSince = Date.now();
}
return;
}
var closeButtons = document.querySelectorAll('a[title="Close"], a.closeTheater');
for (var i = 0; i < closeButtons.length; i++) {
// XXX closeTheater buttons stick around in the dom after closing, clientWidth>0 is one way to check if they're visible
@ -69,19 +132,23 @@ var umbraIntervalFunc = function() {
}
}
}
if (window.scrollY > umbraState.bottomReachedScrollY) {
umbraState.bottomReachedScrollY = window.scrollY;
}
if (!clickedSomething) {
if (somethingLeftAbove) {
console.log("scrolling UP because everything on this screen has been clicked but we missed something above");
window.scrollBy(0, -500);
if (somethingLeftBelow) {
// console.log("scrolling down because everything on this screen has been clicked but there's more below document.body.clientHeight=" + document.body.clientHeight);
window.scrollBy(0, 300);
umbraState.idleSince = null;
} else if (somethingLeftBelow) {
console.log("scrolling because everything on this screen has been clicked but there's more below document.body.clientHeight=" + document.body.clientHeight);
window.scrollBy(0, 200);
} else if (umbraState.bottomReachedScrollY + window.innerHeight < document.documentElement.scrollHeight) {
// console.log("scrolling down because we haven't reached the bottom yet document.body.clientHeight=" + document.body.clientHeight);
window.scrollBy(0, 300);
umbraState.idleSince = null;
} else if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) {
console.log("scrolling because we're not to the bottom yet document.body.clientHeight=" + document.body.clientHeight);
window.scrollBy(0, 200);
} else if (somethingLeftAbove) {
// console.log("scrolling UP because we've already been to the bottom, everything on or below this screen has been clicked, but we missed something above");
window.scrollBy(0, -600);
umbraState.idleSince = null;
} else if (umbraState.idleSince == null) {
umbraState.idleSince = Date.now();
@ -95,6 +162,7 @@ var UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC = 10;
// Called from outside of this script.
var umbraBehaviorFinished = function() {
if (umbraState.idleSince != null) {
var idleTimeMs = Date.now() - umbraState.idleSince;
if (idleTimeMs / 1000 > UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC) {

View File

@ -0,0 +1,56 @@
// {"url_regex":"^https?://(?:www\\.)?marquette\\.edu/.*$", "request_idle_timeout_sec":10}
//
// vim:set sw=8 et:
//
// Shared behavior state. idleSince is null while the behavior is still
// scrolling or clicking; otherwise it holds the Date.now() timestamp at which
// the behavior first ran out of things to do.
var umbraState = {'idleSince':null};
// Run umbraScrollInterval every 50 ms; the timer id is kept in umbraIntervalID.
var umbraIntervalID = setInterval(umbraScrollInterval,50);
// Record of the video links that have already been clicked. umbraScrollInterval
// uses the links themselves as property names and stores true for each one.
var umbraAlreadyClicked = {};
// Timer callback that drives the behavior. While the window is not yet at the
// bottom of the document it scrolls down one step. Once at the bottom it
// clicks every link inside div#vid_box that has not been clicked before. When
// a tick finds nothing left to scroll or click, it records the start of the
// idle period in umbraState.idleSince (only if one is not already recorded).
function umbraScrollInterval() {
        // Not at the bottom yet: keep scrolling and mark ourselves as active.
        if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) {
                umbraScroll();
                umbraState.idleSince = null;
                return;
        }

        var videoBoxes = document.querySelectorAll("div#vid_box a");
        var clickedVideo = false;
        // Declare the counter with var: assigning to an undeclared "i" leaks an
        // implicit global and throws a ReferenceError in strict mode.
        for (var i = 0; i < videoBoxes.length; i++) {
                var link = videoBoxes[i];
                // The link is converted to a string when used as a property
                // name, so links with the same string form are clicked once.
                if (!(link in umbraAlreadyClicked)) {
                        link.click();
                        umbraState.idleSince = null;
                        umbraAlreadyClicked[link] = true;
                        clickedVideo = true;
                }
        }

        // Nothing was clicked on this tick: start the idle clock if it is not
        // already running.
        if (!clickedVideo && umbraState.idleSince == null) {
                umbraState.idleSince = Date.now();
        }
}
// Scroll the window down by one fixed step.
function umbraScroll() {
        var stepPx = 50;
        window.scrollBy(0, stepPx);
}
// Idle threshold, in seconds: once nothing has been scrolled or clicked for
// longer than this, the behavior reports itself as finished with the page.
var UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC = 10;

// Called from outside of this script.
// Returns true when umbraState.idleSince is set and more than
// UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC seconds have passed since then;
// returns false otherwise.
var umbraBehaviorFinished = function() {
        // No idle period recorded means the behavior is still active.
        if (umbraState.idleSince == null) {
                return false;
        }
        var idleSeconds = (Date.now() - umbraState.idleSince) / 1000;
        return idleSeconds > UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC;
};

View File

@ -68,7 +68,8 @@ class Behavior:
self.notify_of_activity()
def is_finished(self):
msg_id = self.umbra_worker.send_to_chrome(method="Runtime.evaluate", params={"expression": "umbraBehaviorFinished()"})
msg_id = self.umbra_worker.send_to_chrome(method="Runtime.evaluate",
suppress_logging=True, params={"expression":"umbraBehaviorFinished()"})
self.waiting_result_msg_ids.append(msg_id)
request_idle_timeout_sec = 30

View File

@ -144,11 +144,12 @@ class Browser:
self._behavior = None
def send_to_chrome(self, **kwargs):
def send_to_chrome(self, suppress_logging=False, **kwargs):
msg_id = next(self.command_id)
kwargs['id'] = msg_id
msg = json.dumps(kwargs)
self.logger.debug('sending message to {}: {}'.format(self._websock, msg))
if not suppress_logging:
self.logger.debug('sending message to {}: {}'.format(self._websock, msg))
self._websock.send(msg)
return msg_id

View File

@ -117,6 +117,9 @@ class AmqpBrowserController:
break # out of "while True" to acquire another browser
except socket.timeout:
pass
except socket.error:
self.logger.error("problem consuming messages from AMQP, will try reconnecting after active browsing finishes", exc_info=True)
self._reconnect_requested = True
if self._consumer_stop.is_set() or time.time() - start >= timeout or self._reconnect_requested:
browser.stop()