mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 00:29:53 -05:00
Convert from one big json file, to js files with a regex as a comment at the top.
This commit is contained in:
parent
4c22891093
commit
f2f78d2ced
52
umbra/behaviors.d/facebook.js
Normal file
52
umbra/behaviors.d/facebook.js
Normal file
@ -0,0 +1,52 @@
|
||||
//.*facebook.com.*
|
||||
// Returns true when the top edge of element `e` is inside the visible viewport.
//
// getBoundingClientRect() reports coordinates relative to the viewport, not the
// document, so the top edge is compared against 0..window.innerHeight. Comparing
// it against window.scrollY (a document offset) would select elements that are
// actually below the visible area once the page has been scrolled.
var isOnScreen = function(e) {
    var eTop = e.getBoundingClientRect().top;
    return eTop >= 0 && eTop < window.innerHeight;
};
|
||||
|
||||
// CSS selector for the page elements this behavior clicks.
var THINGS_TO_CLICK_SELECTOR = 'a.UFIPagerLink > span, a.UFIPagerLink, a[href^="/browse/likes"], span.UFIReplySocialSentenceLinkText';

// Elements that have already been clicked, stored by reference. An array is
// used instead of a plain object because object keys are strings: every DOM
// element would be coerced with toString(), so distinct elements would collide
// on the same key and be wrongly treated as already clicked.
var alreadyClicked = [];

// Handle of the polling timer started at the bottom of this script.
var intervalId;

// One polling step, run every 200ms:
//   1. If a "Close" link is present, click it and do nothing else this tick.
//   2. Otherwise click the first not-yet-clicked target that is on screen.
//   3. If nothing was clicked, scroll down 100px when unclicked targets remain
//      off screen or when the bottom of the page has not been reached.
var intervalFunc = function() {
    var closeButton = document.querySelector('a[title="Close"]');
    if (closeButton) {
        console.log("clicking close button " + closeButton);
        closeButton.click();
        return;
    }

    var thingsToClick = document.querySelectorAll(THINGS_TO_CLICK_SELECTOR);
    var clickedSomething = false;
    var somethingLeftToClick = false;

    for (var i = 0; i < thingsToClick.length; i++) {
        var target = thingsToClick[i];
        if (alreadyClicked.indexOf(target) !== -1) {
            continue;
        }
        if (isOnScreen(target)) {
            console.log("clicking at " + target.getBoundingClientRect().top + " on " + target);
            target.click();
            // Visual marker showing which elements have been clicked.
            target.style.border = '1px solid #0a0';
            alreadyClicked.push(target);
            clickedSomething = true;
            // Only one click per tick.
            break;
        } else {
            somethingLeftToClick = true;
        }
    }

    if (!clickedSomething) {
        if (somethingLeftToClick) {
            console.log("scrolling because everything on this screen has been clicked but there's more below document.body.clientHeight=" + document.body.clientHeight);
            window.scrollBy(0, 100);
        } else if (window.scrollY + window.innerHeight + 10 < document.body.clientHeight) {
            console.log("scrolling because we're not to the bottom yet document.body.clientHeight=" + document.body.clientHeight);
            window.scrollBy(0, 100);
        }
    }
};

intervalId = setInterval(intervalFunc, 200);
|
16
umbra/behaviors.d/flickr.js
Normal file
16
umbra/behaviors.d/flickr.js
Normal file
@ -0,0 +1,16 @@
|
||||
//.*flickr.com.*
|
||||
// Scroll the page down continuously, 50px every 100ms.
setInterval(function() { window.scrollBy(0,50); }, 100);

// After 5 seconds, click the slideshow link if the page has one.
setTimeout(function() {
    var slideshowLinks = document.evaluate("//a[contains(@class, 'sn-ico-slideshow')]", document, null, XPathResult.UNORDERED_NODE_ITERATOR_TYPE, null);
    var slideshowLink = slideshowLinks.iterateNext();
    // iterateNext() returns null when nothing matched.
    if (slideshowLink) {
        slideshowLink.click();
    }
}, 5000);

// After 5 seconds, collect the photo links and click one of them every
// 5 seconds until all of them have been clicked.
setTimeout(function() {
    // A snapshot result is used instead of a live iterator: an iterator is
    // invalidated (iterateNext() throws) as soon as the document changes, and
    // this page is scrolled and clicked between ticks.
    var photoLinks = document.evaluate("//a[contains(@data-track, 'photo-click')]", document, null, XPathResult.UNORDERED_NODE_SNAPSHOT_TYPE, null);
    var nextIndex = 0;
    var clickTimerId = setInterval(function() {
        if (nextIndex >= photoLinks.snapshotLength) {
            // Nothing left to click; stop polling.
            clearInterval(clickTimerId);
            return;
        }
        photoLinks.snapshotItem(nextIndex).click();
        nextIndex += 1;
    }, 5000);
}, 5000);
|
@ -1,14 +0,0 @@
|
||||
[
|
||||
{
|
||||
"scripts": [
|
||||
"setInterval(function() { window.scrollBy(0,500); }, 150);"
|
||||
],
|
||||
"site": ".*facebook.com.*"
|
||||
},
|
||||
{
|
||||
"scripts": [
|
||||
"setInterval(function() { window.scrollBy(0,50); }, 50);"
|
||||
],
|
||||
"site": ".*flickr.com.*"
|
||||
}
|
||||
]
|
@ -2,17 +2,25 @@
|
||||
|
||||
from json import dumps, load
|
||||
from time import sleep
|
||||
from itertools import chain
|
||||
import os, re
|
||||
import logging
|
||||
|
||||
behaviors_file = os.path.sep.join(__file__.split(os.path.sep)[:-1] + ['behaviors.json'])
|
||||
def find_behavior_files(directory):
    """Return the sorted paths of every ``*.js`` file found under *directory*.

    The tree is walked recursively. Sorting makes the load order independent
    of the order in which the filesystem lists entries. A missing directory
    yields an empty list.
    """
    return sorted(
        os.path.join(dirpath, name)
        for dirpath, _dirnames, filenames in os.walk(directory)
        for name in filenames
        if name.endswith('.js')
    )


def load_behaviors(file_names):
    """Parse behavior files into ``{'site': regex, 'script': javascript}`` dicts.

    Each file is expected to start with a ``//<regex>`` comment line giving
    the url pattern; the remaining lines are the script. Files that are empty
    or do not start with ``//`` are skipped with a warning.
    """
    loaded = []
    for file_name in file_names:
        with open(file_name) as behavior_file:
            lines = behavior_file.readlines()
        if not lines or not lines[0].startswith('//'):
            logging.getLogger('behaviors').warning(
                'skipping %s: first line is not a "//<regex>" comment', file_name)
            continue
        # Drop the leading "//" and surrounding whitespace to get the pattern.
        pattern = lines[0][2:].strip()
        loaded.append({'site': pattern, 'script': ''.join(lines[1:])})
    return loaded


# Directory of behavior scripts, located next to this module.
behaviors_directory = os.path.join(os.path.dirname(__file__), 'behaviors.d')
behavior_files = find_behavior_files(behaviors_directory)
# Loaded once at import time; each entry has the keys 'site' and 'script'.
behaviors = load_behaviors(behavior_files)
|
||||
def execute(url, websock, command_id):
    """Send the script of every behavior whose pattern matches *url*.

    Uses the module-level ``behaviors`` list that is loaded at import time,
    rather than re-reading a behaviors file on each call.

    url -- address of the page that was loaded
    websock -- object whose ``send(msg)`` method delivers a message string
    command_id -- iterator; ``next(command_id)`` supplies each message id

    Patterns are tested with ``re.match``, so they are anchored at the start
    of the url. Each matching script is sent as a "Runtime.evaluate" request
    encoded as json.
    """
    logger = logging.getLogger('behaviors')
    for behavior in behaviors:
        logger.debug('comparing %s and %s', behavior['site'], url)
        if not re.match(behavior['site'], url):
            continue
        msg = dumps(dict(method="Runtime.evaluate", params={"expression": behavior['script']}, id=next(command_id)))
        logger.debug('sending message to {}: {}'.format(websock, msg))
        websock.send(msg)
|
||||
|
@ -13,6 +13,7 @@ import subprocess
|
||||
import signal
|
||||
from kombu import Connection, Exchange, Queue
|
||||
import tempfile
|
||||
from umbra import behaviors
|
||||
|
||||
class UmbraWorker:
|
||||
logger = logging.getLogger('umbra.UmbraWorker')
|
||||
@ -27,6 +28,7 @@ class UmbraWorker:
|
||||
self.client_id = client_id
|
||||
self.page_done = threading.Event()
|
||||
self.idle_timer = None
|
||||
self.hard_stop_timer = None
|
||||
|
||||
def browse_page(self, url, url_metadata):
|
||||
with self.lock:
|
||||
@ -50,7 +52,9 @@ class UmbraWorker:
|
||||
def _reset_idle_timer(self):
    """Restart the idle countdown and make sure the hard-stop timer is running.

    Any pending idle timer is cancelled and replaced with a fresh 10 second
    timer that sets ``self.page_done`` when it fires. The hard-stop timer is
    created only when none exists yet, and is started right away; a Timer
    that is never started would never fire, leaving the 10 minute cap
    without effect.
    """
    if self.idle_timer:
        self.idle_timer.cancel()
    self.idle_timer = threading.Timer(10, self.page_done.set)
    if not self.hard_stop_timer:
        # 10 minutes is as long as we should give 1 page
        self.hard_stop_timer = threading.Timer(600, self.page_done.set)
        self.hard_stop_timer.start()
    self.idle_timer.start()
|
||||
|
||||
def visit_page(self, websock):
|
||||
@ -84,7 +88,6 @@ class UmbraWorker:
|
||||
self.send_request_to_amqp(message)
|
||||
elif "method" in message.keys() and message["method"] == "Page.loadEventFired":
|
||||
self.logger.debug("got Page.loadEventFired, starting behaviors for {}".format(self.url))
|
||||
from umbra import behaviors
|
||||
behaviors.execute(self.url, websock, self.command_id)
|
||||
|
||||
class Umbra:
|
||||
@ -196,9 +199,6 @@ class Chrome:
|
||||
self.chrome_process.wait()
|
||||
|
||||
def main():
|
||||
# logging.basicConfig(stream=sys.stdout, level=logging.INFO,
|
||||
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG,
|
||||
format='%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')
|
||||
|
||||
arg_parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
|
||||
description='umbra - Browser automation tool',
|
||||
|
Loading…
x
Reference in New Issue
Block a user