diff --git a/README.md b/README.md index 331312e..11b8d38 100644 --- a/README.md +++ b/README.md @@ -22,8 +22,7 @@ Install via pip from this repo, e.g. pip install git+https://github.com/internetarchive/umbra.git Umbra requires an AMQP messaging service like RabbitMQ. On Ubuntu, -`sudo apt-get install rabbitmq-server` will install and start RabbitMQ -at amqp://guest:guest@localhost:5672/%2f, which the default AMQP url for umbra. +`sudo apt-get install rabbitmq-server` will install and start RabbitMQ at amqp://guest:guest@localhost:5672/%2f, which is the default AMQP url for umbra. Run --- diff --git a/bin/queue-url b/bin/queue-url index a56d806..e93cfa8 100755 --- a/bin/queue-url +++ b/bin/queue-url @@ -15,7 +15,7 @@ arg_parser.add_argument('-u', '--url', dest='amqp_url', default='amqp://guest:gu help='URL identifying the AMQP server to talk to') arg_parser.add_argument('--exchange', dest='amqp_exchange', default='umbra', help='AMQP exchange name') -arg_parser.add_argument('--routing-key', dest='amqp_routing_key', default='url', +arg_parser.add_argument('--routing-key', dest='amqp_routing_key', default='urls', help='AMQP routing key') arg_parser.add_argument('-i', '--client-id', dest='client_id', default='load_url.0', help='client id - included in the json payload with each url; umbra uses this value as the routing key to send requests back to') diff --git a/umbra/behaviors.d/facebook.js b/umbra/behaviors.d/facebook.js index 475e3c4..e2eb271 100644 --- a/umbra/behaviors.d/facebook.js +++ b/umbra/behaviors.d/facebook.js @@ -8,18 +8,81 @@ var umbraAboveBelowOrOnScreen = function(e) { if (eTop < window.scrollY) { return -1; // above } else if (eTop > window.scrollY + window.innerHeight) { + // if (e.clientWidth != 0) { + // console.warn("e.clientWidth=" + e.clientWidth + " though it appears to be below the screen? e.getBoundingClientRect().top=" + eTop + " window.scrollY=" + window.scrollY + " window.innerHeight=" + window.innerHeight + " e=" + e); + // } return 1; // below } else { + // if (e.clientWidth != 0) { + // console.warn("e.clientWidth=" + e.clientWidth + " though it appears to be on screen? e.getBoundingClientRect().top=" + eTop + " window.scrollY=" + window.scrollY + " window.innerHeight=" + window.innerHeight + " e=" + e); + // } return 0; // on screen } } // comments - 'a.UFIPagerLink > span, a.UFIPagerLink, span.UFIReplySocialSentenceLinkText' var UMBRA_THINGS_TO_CLICK_SELECTOR = 'a[href^="/browse/likes"], *[rel="theater"]'; +//div[class="phm pluginLikeboxStream"] = facebook widget embedded in 3rd party pages +var UMBRA_THINGS_TO_SCROLL_SELECTOR = 'div[class="phm pluginLikeboxStream"]'; +var NUMBER_FAILED_SCROLL_ATTEMPTS_ON_THING_TO_SCROLL_BEFORE_STOP_SCROLLING = 5; var umbraAlreadyClicked = {}; -var umbraState = {'idleSince':null,'expectingSomething':null}; +var umbraAlreadyScrolledThing = {}; +var umbraScrolledThingFailedScrollAttempts = {}; +var umbraState = {'idleSince':null,'expectingSomething':null,'bottomReachedScrollY':0}; var umbraIntervalFunc = function() { + + var thingsToScroll = document.querySelectorAll(UMBRA_THINGS_TO_SCROLL_SELECTOR); + var everythingScrolled = true; + + for (var i = 0; i < thingsToScroll.length; i++) { + var target = thingsToScroll[i]; + + if (!(target in umbraAlreadyScrolledThing)) { + + everythingScrolled = false; + + console.log("scrolling to " + target.scrollHeight + " on element with nodeName " + target.nodeName + " with id of " + target.id); + var lastScrollTop = target.scrollTop; + target.scrollTop = target.scrollHeight; + + umbraState.idleSince = null; + + if (target.scrollTop >= target.scrollHeight) { + umbraAlreadyScrolledThing[target] = true; + } + else if (target.scrollTop == lastScrollTop) { + if (umbraScrolledThingFailedScrollAttempts[target]) { + umbraScrolledThingFailedScrollAttempts[target]++; + } + else { + umbraScrolledThingFailedScrollAttempts[target] = 1; + } + + if (umbraScrolledThingFailedScrollAttempts[target] >= NUMBER_FAILED_SCROLL_ATTEMPTS_ON_THING_TO_SCROLL_BEFORE_STOP_SCROLLING) { + umbraAlreadyScrolledThing[target] = true; + } + } + else { + //reset failed count on a successful scroll + umbraScrolledThingFailedScrollAttempts[target] = 0; + } + } + else { + console.log("done scrolling for element with nodeName " + target.nodeName + " with id of " + target.id) + } + + umbraState.expectingSomething = null; + } + + if (thingsToScroll && thingsToScroll.length > 0 && everythingScrolled) { + if (umbraState.idleSince == null) { + umbraState.idleSince = Date.now(); + } + + return; + } + var closeButtons = document.querySelectorAll('a[title="Close"], a.closeTheater'); for (var i = 0; i < closeButtons.length; i++) { // XXX closeTheater buttons stick around in the dom after closing, clientWidth>0 is one way to check if they're visible @@ -69,19 +132,23 @@ var umbraIntervalFunc = function() { } } } + + if (window.scrollY > umbraState.bottomReachedScrollY) { + umbraState.bottomReachedScrollY = window.scrollY; + } if (!clickedSomething) { - if (somethingLeftAbove) { - console.log("scrolling UP because everything on this screen has been clicked but we missed something above"); - window.scrollBy(0, -500); + if (somethingLeftBelow) { + // console.log("scrolling down because everything on this screen has been clicked but there's more below document.body.clientHeight=" + document.body.clientHeight); + window.scrollBy(0, 300); umbraState.idleSince = null; - } else if (somethingLeftBelow) { - console.log("scrolling because everything on this screen has been clicked but there's more below document.body.clientHeight=" + document.body.clientHeight); - window.scrollBy(0, 200); + } else if (umbraState.bottomReachedScrollY + window.innerHeight < document.documentElement.scrollHeight) { + // console.log("scrolling down because we haven't reached the bottom yet document.body.clientHeight=" + document.body.clientHeight); + window.scrollBy(0, 300); umbraState.idleSince = null; - } else if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) { - console.log("scrolling because we're not to the bottom yet document.body.clientHeight=" + document.body.clientHeight); - window.scrollBy(0, 200); + } else if (somethingLeftAbove) { + // console.log("scrolling UP because we've already been to the bottom, everything on or below this screen has been clicked, but we missed something above"); + window.scrollBy(0, -600); umbraState.idleSince = null; } else if (umbraState.idleSince == null) { umbraState.idleSince = Date.now(); @@ -95,6 +162,7 @@ var UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC = 10; // Called from outside of this script. var umbraBehaviorFinished = function() { + if (umbraState.idleSince != null) { var idleTimeMs = Date.now() - umbraState.idleSince; if (idleTimeMs / 1000 > UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC) { diff --git a/umbra/behaviors.d/marquette_edu.js b/umbra/behaviors.d/marquette_edu.js new file mode 100644 index 0000000..cdaab00 --- /dev/null +++ b/umbra/behaviors.d/marquette_edu.js @@ -0,0 +1,56 @@ +// {"url_regex":"^https?://(?:www\\.)?marquette\\.edu/.*$", "request_idle_timeout_sec":10} +// +// vim:set sw=8 et: +// + +var umbraState = {'idleSince':null}; +var umbraIntervalID = setInterval(umbraScrollInterval,50); +var umbraAlreadyClicked = {}; +function umbraScrollInterval() { + + //if not at the bottom + if(window.scrollY + window.innerHeight < document.documentElement.scrollHeight) { + umbraScroll(); + umbraState.idleSince=null; + } + else { + var videoBoxes = document.querySelectorAll("div#vid_box a"); + var clickedVideo = false; + + for(i=0;i UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC) { + return true; + } + } + return false; +} + diff --git a/umbra/behaviors.py b/umbra/behaviors.py index 52bce1c..41e6055 100644 --- a/umbra/behaviors.py +++ b/umbra/behaviors.py @@ -68,7 +68,8 @@ class Behavior: self.notify_of_activity() def is_finished(self): - msg_id = self.umbra_worker.send_to_chrome(method="Runtime.evaluate", params={"expression": "umbraBehaviorFinished()"}) + msg_id = self.umbra_worker.send_to_chrome(method="Runtime.evaluate", + suppress_logging=True, params={"expression":"umbraBehaviorFinished()"}) self.waiting_result_msg_ids.append(msg_id) request_idle_timeout_sec = 30 diff --git a/umbra/browser.py b/umbra/browser.py index dd042d2..68fa49a 100644 --- a/umbra/browser.py +++ b/umbra/browser.py @@ -144,11 +144,12 @@ class Browser: self._behavior = None - def send_to_chrome(self, **kwargs): + def send_to_chrome(self, suppress_logging=False, **kwargs): msg_id = next(self.command_id) kwargs['id'] = msg_id msg = json.dumps(kwargs) - self.logger.debug('sending message to {}: {}'.format(self._websock, msg)) + if not suppress_logging: + self.logger.debug('sending message to {}: {}'.format(self._websock, msg)) self._websock.send(msg) return msg_id diff --git a/umbra/controller.py b/umbra/controller.py index 7a74155..b37a588 100644 --- a/umbra/controller.py +++ b/umbra/controller.py @@ -117,6 +117,9 @@ class AmqpBrowserController: break # out of "while True" to acquire another browser except socket.timeout: pass + except socket.error: + self.logger.error("problem consuming messages from AMQP, will try reconnecting after active browsing finishes", exc_info=True) + self._reconnect_requested = True if self._consumer_stop.is_set() or time.time() - start >= timeout or self._reconnect_requested: browser.stop()