scrolldown seems to get everything for flickr and facebook at the moment

This commit is contained in:
Noah Levitt 2014-02-14 17:57:04 -08:00
parent 28282641f2
commit b4846e1063
3 changed files with 23 additions and 38 deletions

View file

@ -1,27 +1,14 @@
[ [
{ {
"scripts": [ "scripts": [
"setInterval(function() { window.scrollBy(10000,10000); }, 1000);" "setInterval(function() { window.scrollBy(0,500); }, 150);"
],
"site": ".*"
},
{
"scripts": [
"setTimeout(function() { setInterval(function() { a = document.evaluate( '//a[(@href = \"#\" and @role = \"button\" and contains(.,\"more comments\")) or starts-with(@href, \"/browse/likes\")]', document, null, XPathResult.UNORDERED_NODE_ITERATOR_TYPE, null ); f = a.iterateNext(); f.click();}, 1000);}, 5000);"
], ],
"site": ".*facebook.com.*" "site": ".*facebook.com.*"
}, },
{ {
"scripts": [ "scripts": [
"setTimeout(function() { a = document.evaluate( \"//a[contains(@class, 'sn-ico-slideshow')]\", document, null, XPathResult.UNORDERED_NODE_ITERATOR_TYPE, null ); f = a.iterateNext(); f.click();}, 5000);" "setInterval(function() { window.scrollBy(0,50); }, 50);"
],
"site": ".*flickr.com.*"
},
{
"scripts": [
"setTimeout(function() { a = document.evaluate( \"//a[contains(@data-track, 'photo-click')]\", document, null, XPathResult.UNORDERED_NODE_ITERATOR_TYPE, null ); setInterval(function() { f = a.iterateNext(); f.click();}, 5000)}, 5000);"
], ],
"site": ".*flickr.com.*" "site": ".*flickr.com.*"
} }
] ]

View file

@ -8,7 +8,6 @@ import logging
behaviors_file = os.path.sep.join(__file__.split(os.path.sep)[:-1] + ['behaviors.json']) behaviors_file = os.path.sep.join(__file__.split(os.path.sep)[:-1] + ['behaviors.json'])
def execute(url, websock, command_id): def execute(url, websock, command_id):
logger = logging.getLogger('behaviors') logger = logging.getLogger('behaviors')
sleep(5)
with open(behaviors_file) as js: with open(behaviors_file) as js:
behaviors = load(js) behaviors = load(js)
for behavior in behaviors: for behavior in behaviors:

View file

@ -48,7 +48,7 @@ class UmbraWorker:
def _reset_idle_timer(self): def _reset_idle_timer(self):
if self.idle_timer: if self.idle_timer:
self.idle_timer.cancel() self.idle_timer.cancel()
self.idle_timer = threading.Timer(10, self.page_done.set) self.idle_timer = threading.Timer(60, self.page_done.set)
self.idle_timer.start() self.idle_timer.start()
def visit_page(self, websock): def visit_page(self, websock):
@ -56,35 +56,34 @@ class UmbraWorker:
self.logger.debug('sending message to {}: {}'.format(websock, msg)) self.logger.debug('sending message to {}: {}'.format(websock, msg))
websock.send(msg) websock.send(msg)
msg = dumps(dict(method="Page.enable", id=next(self.command_id)))
self.logger.debug('sending message to {}: {}'.format(websock, msg))
websock.send(msg)
msg = dumps(dict(method="Page.navigate", id=next(self.command_id), params={"url": self.url})) msg = dumps(dict(method="Page.navigate", id=next(self.command_id), params={"url": self.url}))
self.logger.debug('sending message to {}: {}'.format(websock, msg)) self.logger.debug('sending message to {}: {}'.format(websock, msg))
websock.send(msg) websock.send(msg)
from umbra import behaviors def send_request_to_amqp(self, chrome_msg):
behaviors.execute(self.url, websock, self.command_id) payload = chrome_msg['params']['request']
payload['parentUrl'] = self.url
payload['parentUrlMetadata'] = self.url_metadata
self.logger.debug('sending to amqp exchange={} routing_key={} payload={}'.format(self.umbra.umbra_exchange, self.client_id, payload))
with self.umbra.producer_lock:
self.umbra.producer.publish(payload,
exchange=self.umbra.umbra_exchange,
routing_key=self.client_id)
def handle_message(self, websock, message): def handle_message(self, websock, message):
# self.logger.debug("handling message from websocket {} - {}".format(websock, message)) self.logger.debug("handling message from websocket {} - {}".format(websock, message[:95]))
self._reset_idle_timer()
message = loads(message) message = loads(message)
if "method" in message.keys() and message["method"] == "Network.requestWillBeSent": if "method" in message.keys() and message["method"] == "Network.requestWillBeSent":
self._reset_idle_timer() self.send_request_to_amqp(message)
payload = message['params']['request'] elif "method" in message.keys() and message["method"] == "Page.loadEventFired":
payload['parentUrl'] = self.url self.logger.debug("got Page.loadEventFired, starting behaviors for {}".format(self.url))
payload['parentUrlMetadata'] = self.url_metadata from umbra import behaviors
self.logger.debug('sending to amqp exchange={} routing_key={} payload={}'.format(self.umbra.umbra_exchange, self.client_id, payload)) behaviors.execute(self.url, websock, self.command_id)
# bind a queue with the same name as the return routing key
# (AMQPUrlReceiver in heritrix expects this)
request_queue = Queue(self.client_id,
routing_key=self.client_id,
exchange=self.umbra.umbra_exchange)
with self.umbra.producer_lock:
# self.umbra.producer.publish(payload,
# routing_key=self.client_id,
# exchange=self.umbra.umbra_exchange,
# declare=[request_queue])
self.umbra.producer.publish(payload,
routing_key=self.client_id,
exchange=self.umbra.umbra_exchange)
def get_message_handler(self, url, url_metadata, command_id): def get_message_handler(self, url, url_metadata, command_id):
this_watchdog = self.watchdog(command_id) this_watchdog = self.watchdog(command_id)