mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 00:29:53 -05:00
Convert from one big JSON file to JS files with a regex as a comment at the top.
This commit is contained in:
parent
4c22891093
commit
f2f78d2ced
52
umbra/behaviors.d/facebook.js
Normal file
52
umbra/behaviors.d/facebook.js
Normal file
@ -0,0 +1,52 @@
|
|||||||
|
//.*facebook.com.*
|
||||||
|
/**
 * Reports whether the top edge of an element lies inside the visible viewport.
 *
 * getBoundingClientRect() returns coordinates relative to the viewport, not
 * the document, so the top edge is compared against 0 and window.innerHeight
 * directly. Comparing it against window.scrollY would mix the two coordinate
 * systems and give wrong answers as soon as the page has been scrolled.
 *
 * @param {Element} e - element to test
 * @returns {boolean} true when the element's top edge is within the viewport
 */
var isOnScreen = function(e) {
    var eTop = e.getBoundingClientRect().top;
    return eTop >= 0 && eTop < window.innerHeight;
};
|
||||||
|
|
||||||
|
// CSS selector for the links this behavior clicks on.
var THINGS_TO_CLICK_SELECTOR = 'a.UFIPagerLink > span, a.UFIPagerLink, a[href^="/browse/likes"], span.UFIReplySocialSentenceLinkText';

// Elements that have already been clicked, tracked by identity. A plain
// object keyed by the element would coerce every element to a string such as
// "[object HTMLAnchorElement]", making distinct elements collide on one key.
var alreadyClicked = new WeakSet();

/**
 * One tick of the behavior; it performs at most one action per call:
 *   1. if a link titled "Close" exists, click it and stop;
 *   2. otherwise click the first not-yet-clicked, on-screen element matching
 *      THINGS_TO_CLICK_SELECTOR and outline it in green;
 *   3. otherwise scroll down 100px, either because unclicked matches remain
 *      off screen or because the bottom of the page has not been reached.
 *
 * Relies on isOnScreen(), defined earlier in this script.
 */
var intervalFunc = function() {
    var closeButton = document.querySelector('a[title="Close"]');
    if (closeButton) {
        console.log("clicking close button " + closeButton);
        closeButton.click();
        return;
    }

    var thingsToClick = document.querySelectorAll(THINGS_TO_CLICK_SELECTOR);
    var clickedSomething = false;
    var somethingLeftToClick = false;

    for (var i = 0; i < thingsToClick.length; i++) {
        var target = thingsToClick[i];
        if (alreadyClicked.has(target)) {
            continue;
        }
        if (!isOnScreen(target)) {
            // Remember that there is still work further down the page.
            somethingLeftToClick = true;
            continue;
        }

        console.log("clicking at " + target.getBoundingClientRect().top + " on " + target);
        target.click();
        target.style.border = '1px solid #0a0';
        alreadyClicked.add(target);
        clickedSomething = true;
        // Only one click per tick.
        break;
    }

    if (clickedSomething) {
        return;
    }

    if (somethingLeftToClick) {
        console.log("scrolling because everything on this screen has been clicked but there's more below document.body.clientHeight=" + document.body.clientHeight);
        window.scrollBy(0, 100);
    } else if (window.scrollY + window.innerHeight + 10 < document.body.clientHeight) {
        console.log("scrolling because we're not to the bottom yet document.body.clientHeight=" + document.body.clientHeight);
        window.scrollBy(0, 100);
    }
};

// Run the behavior every 200ms.
var intervalId = setInterval(intervalFunc, 200);
|
16
umbra/behaviors.d/flickr.js
Normal file
16
umbra/behaviors.d/flickr.js
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
//.*flickr.com.*
|
||||||
|
// Scroll down 50px every 100ms.
setInterval(function() { window.scrollBy(0, 50); }, 100);

// After 5 seconds, click a slideshow link if the page has one.
setTimeout(function() {
    var slideshowLink = document.evaluate(
        "//a[contains(@class, 'sn-ico-slideshow')]", document, null,
        XPathResult.ANY_UNORDERED_NODE_TYPE, null).singleNodeValue;
    // singleNodeValue is null when nothing matches; avoid calling click() on null.
    if (slideshowLink) {
        slideshowLink.click();
    }
}, 5000);

// After 5 seconds, start clicking the photo links, one every 5 seconds.
setTimeout(function() {
    // A snapshot result stays usable when the document changes afterwards,
    // whereas a live node iterator is invalidated by DOM mutation.
    var photoLinks = document.evaluate(
        "//a[contains(@data-track, 'photo-click')]", document, null,
        XPathResult.UNORDERED_NODE_SNAPSHOT_TYPE, null);
    var nextIndex = 0;

    var clickerId = setInterval(function() {
        if (nextIndex >= photoLinks.snapshotLength) {
            // All matched links have been clicked; stop the timer.
            clearInterval(clickerId);
            return;
        }
        photoLinks.snapshotItem(nextIndex).click();
        nextIndex += 1;
    }, 5000);
}, 5000);
|
@ -1,14 +0,0 @@
|
|||||||
[
|
|
||||||
{
|
|
||||||
"scripts": [
|
|
||||||
"setInterval(function() { window.scrollBy(0,500); }, 150);"
|
|
||||||
],
|
|
||||||
"site": ".*facebook.com.*"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"scripts": [
|
|
||||||
"setInterval(function() { window.scrollBy(0,50); }, 50);"
|
|
||||||
],
|
|
||||||
"site": ".*flickr.com.*"
|
|
||||||
}
|
|
||||||
]
|
|
@ -2,17 +2,25 @@
|
|||||||
|
|
||||||
from json import dumps, load
|
from json import dumps, load
|
||||||
from time import sleep
|
from time import sleep
|
||||||
|
from itertools import chain
|
||||||
import os, re
|
import os, re
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
behaviors_file = os.path.sep.join(__file__.split(os.path.sep)[:-1] + ['behaviors.json'])
|
behaviors_directory = os.path.sep.join(__file__.split(os.path.sep)[:-1] + ['behaviors.d'])
|
||||||
|
behavior_files = chain(*[[dir + os.path.sep + file for file in files] for dir, dirs, files in os.walk(behaviors_directory)])
|
||||||
|
behaviors = []
|
||||||
|
for file_name in behavior_files:
|
||||||
|
lines = open(file_name).readlines()
|
||||||
|
pattern, script = lines[0][2:].strip(), ''.join(lines[1:])
|
||||||
|
behaviors.append({'site' : pattern, 'script': script})
|
||||||
|
|
||||||
|
print(behaviors)
|
||||||
def execute(url, websock, command_id):
|
def execute(url, websock, command_id):
|
||||||
logger = logging.getLogger('behaviors')
|
logger = logging.getLogger('behaviors')
|
||||||
with open(behaviors_file) as js:
|
print(behaviors)
|
||||||
behaviors = load(js)
|
for behavior in behaviors:
|
||||||
for behavior in behaviors:
|
print("Comparing %s and %s" %(behavior['site'], url))
|
||||||
if re.match(behavior['site'], url):
|
if re.match(behavior['site'], url):
|
||||||
for script in behavior['scripts']:
|
msg = dumps(dict(method="Runtime.evaluate", params={"expression": behavior['script']}, id=next(command_id)))
|
||||||
msg = dumps(dict(method="Runtime.evaluate", params={"expression": script}, id=next(command_id)))
|
logger.debug('sending message to {}: {}'.format(websock, msg))
|
||||||
logger.debug('sending message to {}: {}'.format(websock, msg))
|
websock.send(msg)
|
||||||
websock.send(msg)
|
|
||||||
|
@ -13,6 +13,7 @@ import subprocess
|
|||||||
import signal
|
import signal
|
||||||
from kombu import Connection, Exchange, Queue
|
from kombu import Connection, Exchange, Queue
|
||||||
import tempfile
|
import tempfile
|
||||||
|
from umbra import behaviors
|
||||||
|
|
||||||
class UmbraWorker:
|
class UmbraWorker:
|
||||||
logger = logging.getLogger('umbra.UmbraWorker')
|
logger = logging.getLogger('umbra.UmbraWorker')
|
||||||
@ -27,6 +28,7 @@ class UmbraWorker:
|
|||||||
self.client_id = client_id
|
self.client_id = client_id
|
||||||
self.page_done = threading.Event()
|
self.page_done = threading.Event()
|
||||||
self.idle_timer = None
|
self.idle_timer = None
|
||||||
|
self.hard_stop_timer = None
|
||||||
|
|
||||||
def browse_page(self, url, url_metadata):
|
def browse_page(self, url, url_metadata):
|
||||||
with self.lock:
|
with self.lock:
|
||||||
@ -50,7 +52,9 @@ class UmbraWorker:
|
|||||||
def _reset_idle_timer(self):
|
def _reset_idle_timer(self):
|
||||||
if self.idle_timer:
|
if self.idle_timer:
|
||||||
self.idle_timer.cancel()
|
self.idle_timer.cancel()
|
||||||
self.idle_timer = threading.Timer(60, self.page_done.set)
|
self.idle_timer = threading.Timer(10, self.page_done.set)
|
||||||
|
if not self.hard_stop_timer: #10 minutes is as long as we should give 1 page
|
||||||
|
self.hard_stop_timer = threading.Timer(600, self.page_done.set)
|
||||||
self.idle_timer.start()
|
self.idle_timer.start()
|
||||||
|
|
||||||
def visit_page(self, websock):
|
def visit_page(self, websock):
|
||||||
@ -84,7 +88,6 @@ class UmbraWorker:
|
|||||||
self.send_request_to_amqp(message)
|
self.send_request_to_amqp(message)
|
||||||
elif "method" in message.keys() and message["method"] == "Page.loadEventFired":
|
elif "method" in message.keys() and message["method"] == "Page.loadEventFired":
|
||||||
self.logger.debug("got Page.loadEventFired, starting behaviors for {}".format(self.url))
|
self.logger.debug("got Page.loadEventFired, starting behaviors for {}".format(self.url))
|
||||||
from umbra import behaviors
|
|
||||||
behaviors.execute(self.url, websock, self.command_id)
|
behaviors.execute(self.url, websock, self.command_id)
|
||||||
|
|
||||||
class Umbra:
|
class Umbra:
|
||||||
@ -196,9 +199,6 @@ class Chrome:
|
|||||||
self.chrome_process.wait()
|
self.chrome_process.wait()
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
# logging.basicConfig(stream=sys.stdout, level=logging.INFO,
|
|
||||||
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG,
|
|
||||||
format='%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')
|
|
||||||
|
|
||||||
arg_parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
|
arg_parser = argparse.ArgumentParser(prog=os.path.basename(sys.argv[0]),
|
||||||
description='umbra - Browser automation tool',
|
description='umbra - Browser automation tool',
|
||||||
|
Loading…
x
Reference in New Issue
Block a user