Merge pull request #11 from eldondev/master

Convert behaviors to independent, runnable javascript files, hard timeout on pages
This commit is contained in:
vonrosen 2014-03-06 11:08:45 -08:00
commit 5b1992a8c0
5 changed files with 96 additions and 28 deletions

View file

@ -0,0 +1,58 @@
//.*facebook.com.*
/**
 * Report whether the top edge of an element lies inside the visible viewport.
 *
 * getBoundingClientRect() returns coordinates relative to the viewport, not
 * to the document, so the top edge is compared against 0..window.innerHeight
 * directly. Adding window.scrollY (a document-relative offset) would mix the
 * two coordinate systems and give wrong answers once the page has scrolled.
 *
 * @param {Element} e - element to test.
 * @returns {boolean} true when the element's top edge is strictly below the
 *     top of the viewport and strictly above its bottom.
 */
var isOnScreen = function(e) {
    var eTop = e.getBoundingClientRect().top;
    return eTop > 0 && eTop < window.innerHeight;
};
// CSS selector matching every element this behavior tries to click.
var THINGS_TO_CLICK_SELECTOR = 'a.UFIPagerLink > span, a.UFIPagerLink, a[href^="/browse/likes"], span.UFIReplySocialSentenceLinkText, a.photo';

// Elements that have already been clicked, stored by reference. A plain
// object cannot be used here: property keys are strings, so distinct elements
// that stringify identically (e.g. every <span>) would share a single key and
// only the first of them would ever be clicked.
var alreadyClicked = [];

// Handle of the timer that drives intervalFunc.
var intervalId;

/**
 * Perform one step of the behavior. Each call does at most one thing:
 *
 *  1. click an 'a[title="Close"]' element if one exists;
 *  2. otherwise click a visible 'a.closeTheater' element if one exists;
 *  3. otherwise click the first not-yet-clicked element matching
 *     THINGS_TO_CLICK_SELECTOR that isOnScreen() reports as visible;
 *  4. otherwise scroll down 100px, either because unclicked elements remain
 *     off screen or because the bottom of the page has not been reached.
 */
var intervalFunc = function() {
    var closeButton = document.querySelector('a[title="Close"]');
    if (closeButton) {
        console.log("clicking close button " + closeButton);
        closeButton.click();
        return;
    }

    var closeTheaterButton = document.querySelector('a.closeTheater');
    // offsetWidth > 0 filters out a button that is present but not rendered.
    if (closeTheaterButton && closeTheaterButton.offsetWidth > 0) {
        console.log("clicking close button " + closeTheaterButton);
        closeTheaterButton.click();
        return;
    }

    var thingsToClick = document.querySelectorAll(THINGS_TO_CLICK_SELECTOR);
    var clickedSomething = false;
    var somethingLeftToClick = false;

    for (var i = 0; i < thingsToClick.length; i++) {
        var target = thingsToClick[i];
        if (alreadyClicked.indexOf(target) !== -1) {
            continue;
        }
        if (!isOnScreen(target)) {
            somethingLeftToClick = true;
            continue;
        }
        console.log("clicking at " + target.getBoundingClientRect().top + " on " + target);
        target.click();
        // Visual marker so clicked elements can be spotted in the page.
        target.style.border = '1px solid #0a0';
        alreadyClicked.push(target);
        clickedSomething = true;
        // Only one click per step; the next step picks up the next element.
        break;
    }

    if (clickedSomething) {
        return;
    }

    if (somethingLeftToClick) {
        console.log("scrolling because everything on this screen has been clicked but there's more below document.body.clientHeight=" + document.body.clientHeight);
        window.scrollBy(0, 100);
    } else if (window.scrollY + window.innerHeight + 10 < document.body.clientHeight) {
        console.log("scrolling because we're not to the bottom yet document.body.clientHeight=" + document.body.clientHeight);
        window.scrollBy(0, 100);
    }
};

// Run one step every 200ms.
intervalId = setInterval(intervalFunc, 200);

View file

@ -0,0 +1,16 @@
//.*flickr.com.*
// Scroll down 50px every 100ms for as long as the page is open.
setInterval(function() { window.scrollBy(0, 50); }, 100);

// After 5 seconds, click the first link whose class contains
// 'sn-ico-slideshow', if the page has one.
setTimeout(function() {
    var slideshowLinks = document.evaluate("//a[contains(@class, 'sn-ico-slideshow')]", document, null, XPathResult.UNORDERED_NODE_ITERATOR_TYPE, null);
    var slideshowLink = slideshowLinks.iterateNext();
    // iterateNext() returns null when nothing matched.
    if (slideshowLink) {
        slideshowLink.click();
    }
}, 5000);

// After 5 seconds, collect every link whose data-track attribute contains
// 'photo-click' and click them one at a time, 5 seconds apart.
setTimeout(function() {
    // A snapshot result is used because the links are visited over a long
    // period: an iterator result is invalidated by any DOM mutation after
    // evaluate(), and iterateNext() then throws.
    var photoLinks = document.evaluate("//a[contains(@data-track, 'photo-click')]", document, null, XPathResult.UNORDERED_NODE_SNAPSHOT_TYPE, null);
    var nextIndex = 0;
    var clickTimerId = setInterval(function() {
        if (nextIndex >= photoLinks.snapshotLength) {
            // Every link has been clicked; stop instead of failing forever.
            clearInterval(clickTimerId);
            return;
        }
        photoLinks.snapshotItem(nextIndex).click();
        nextIndex += 1;
    }, 5000);
}, 5000);

View file

@ -1,14 +0,0 @@
[
{
"scripts": [
"setInterval(function() { window.scrollBy(0,500); }, 150);"
],
"site": ".*facebook.com.*"
},
{
"scripts": [
"setInterval(function() { window.scrollBy(0,50); }, 50);"
],
"site": ".*flickr.com.*"
}
]

View file

@ -2,17 +2,25 @@
from json import dumps, load from json import dumps, load
from time import sleep from time import sleep
from itertools import chain
import os, re import os, re
import logging import logging
behaviors_file = os.path.sep.join(__file__.split(os.path.sep)[:-1] + ['behaviors.json']) behaviors_directory = os.path.sep.join(__file__.split(os.path.sep)[:-1] + ['behaviors.d'])
behavior_files = chain(*[[dir + os.path.sep + file for file in files] for dir, dirs, files in os.walk(behaviors_directory)])
behaviors = []
for file_name in behavior_files:
lines = open(file_name).readlines()
pattern, script = lines[0][2:].strip(), ''.join(lines[1:])
behaviors.append({'site' : pattern, 'script': script})
print(behaviors)
def execute(url, websock, command_id): def execute(url, websock, command_id):
logger = logging.getLogger('behaviors') logger = logging.getLogger('behaviors')
with open(behaviors_file) as js: print(behaviors)
behaviors = load(js) for behavior in behaviors:
for behavior in behaviors: print("Comparing %s and %s" %(behavior['site'], url))
if re.match(behavior['site'], url): if re.match(behavior['site'], url):
for script in behavior['scripts']: msg = dumps(dict(method="Runtime.evaluate", params={"expression": behavior['script']}, id=next(command_id)))
msg = dumps(dict(method="Runtime.evaluate", params={"expression": script}, id=next(command_id))) logger.debug('sending message to {}: {}'.format(websock, msg))
logger.debug('sending message to {}: {}'.format(websock, msg)) websock.send(msg)
websock.send(msg)

View file

@ -13,6 +13,7 @@ import subprocess
import signal import signal
from kombu import Connection, Exchange, Queue from kombu import Connection, Exchange, Queue
import tempfile import tempfile
from umbra import behaviors
class UmbraWorker: class UmbraWorker:
logger = logging.getLogger('umbra.UmbraWorker') logger = logging.getLogger('umbra.UmbraWorker')
@ -27,6 +28,7 @@ class UmbraWorker:
self.client_id = client_id self.client_id = client_id
self.page_done = threading.Event() self.page_done = threading.Event()
self.idle_timer = None self.idle_timer = None
self.hard_stop_timer = None
def browse_page(self, url, url_metadata): def browse_page(self, url, url_metadata):
with self.lock: with self.lock:
@ -50,7 +52,9 @@ class UmbraWorker:
def _reset_idle_timer(self): def _reset_idle_timer(self):
if self.idle_timer: if self.idle_timer:
self.idle_timer.cancel() self.idle_timer.cancel()
self.idle_timer = threading.Timer(60, self.page_done.set) self.idle_timer = threading.Timer(10, self.page_done.set)
if not self.hard_stop_timer: #10 minutes is as long as we should give 1 page
self.hard_stop_timer = threading.Timer(600, self.page_done.set)
self.idle_timer.start() self.idle_timer.start()
def visit_page(self, websock): def visit_page(self, websock):
@ -84,7 +88,6 @@ class UmbraWorker:
self.send_request_to_amqp(message) self.send_request_to_amqp(message)
elif "method" in message.keys() and message["method"] == "Page.loadEventFired": elif "method" in message.keys() and message["method"] == "Page.loadEventFired":
self.logger.debug("got Page.loadEventFired, starting behaviors for {}".format(self.url)) self.logger.debug("got Page.loadEventFired, starting behaviors for {}".format(self.url))
from umbra import behaviors
behaviors.execute(self.url, websock, self.command_id) behaviors.execute(self.url, websock, self.command_id)
class Umbra: class Umbra:
@ -196,9 +199,6 @@ class Chrome:
self.chrome_process.wait() self.chrome_process.wait()
def main(): def main():
# logging.basicConfig(stream=sys.stdout, level=logging.INFO,
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG,
format='%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')
arg_parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]), arg_parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
description='umbra - Browser automation tool', description='umbra - Browser automation tool',