mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-06-20 12:54:23 -04:00
Change the magic first line of behavior JS files to a commented-out JSON blob, which should include the fields 'url_regex' and 'request_idle_timeout_sec'. Behavior.is_finished() now incorporates the custom idle timeout into its check. Also rename variables in behavior scripts with an umbra/UMBRA_ prefix to sort of namespace them, and add "finished" logic to the facebook and vimeo behaviors (flickr needs work to support it).
This commit is contained in:
parent
2a9633ad77
commit
a62a07e6b7
5 changed files with 130 additions and 67 deletions
|
@ -1,23 +1,13 @@
|
||||||
|
// {"request_idle_timeout_sec":10}
|
||||||
|
//
|
||||||
// vim:set sw=8 et:
|
// vim:set sw=8 et:
|
||||||
|
//
|
||||||
|
// Scrolls to the bottom of the page. That's it at the moment.
|
||||||
|
//
|
||||||
|
|
||||||
// STATES = ['NASCENT', 'NEED_SCROLL', 'WAITING', 'FINISHED']
|
|
||||||
|
|
||||||
// var transition = prepareTransition(state);
|
|
||||||
// if (transition.callback) {
|
|
||||||
// newState.callback(state, newState);
|
|
||||||
// }
|
|
||||||
// state = newState;
|
|
||||||
|
|
||||||
// if (state.status === 'NASCENT') {
|
|
||||||
// } else if (state.status == 'NEED_SCROLL') {
|
|
||||||
// } else if (state.status == 'FINISHED') {
|
|
||||||
|
|
||||||
var UMBRA_FINISH_AFTER_IDLE_TIME = 10 * 1000; // ms
|
|
||||||
var umbraState = {'idleSince':null};
|
var umbraState = {'idleSince':null};
|
||||||
var umbraFinished = false;
|
var umbraFinished = false;
|
||||||
var umbraIntervalFunc = function() {
|
var umbraIntervalFunc = function() {
|
||||||
// var needToScroll = (window.scrollY + window.innerHeight + 10 < document.body.clientHeight);
|
|
||||||
// var needToScroll = (document.documentElement.scrollTop + document.documentElement.clientHeight < document.documentElement.scrollHeight);
|
|
||||||
var needToScroll = (window.scrollY + window.innerHeight < document.documentElement.scrollHeight);
|
var needToScroll = (window.scrollY + window.innerHeight < document.documentElement.scrollHeight);
|
||||||
|
|
||||||
// console.log('intervalFunc umbraState.idleSince=' + umbraState.idleSince + ' needToScroll=' + needToScroll + ' window.scrollY=' + window.scrollY + ' window.innerHeight=' + window.innerHeight + ' document.documentElement.scrollHeight=' + document.documentElement.scrollHeight);
|
// console.log('intervalFunc umbraState.idleSince=' + umbraState.idleSince + ' needToScroll=' + needToScroll + ' window.scrollY=' + window.scrollY + ' window.innerHeight=' + window.innerHeight + ' document.documentElement.scrollHeight=' + document.documentElement.scrollHeight);
|
||||||
|
@ -29,10 +19,15 @@ var umbraIntervalFunc = function() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// If we haven't had anything to do (scrolled, clicked, etc) in this amount of
|
||||||
|
// time, then we consider ourselves finished with the page.
|
||||||
|
var UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC = 10;
|
||||||
|
|
||||||
|
// Called from outside of this script.
|
||||||
var umbraBehaviorFinished = function() {
|
var umbraBehaviorFinished = function() {
|
||||||
if (umbraState.idleSince != null) {
|
if (umbraState.idleSince != null) {
|
||||||
var idleTime = Date.now() - umbraState.idleSince;
|
var idleTimeMs = Date.now() - umbraState.idleSince;
|
||||||
if (idleTime > UMBRA_FINISH_AFTER_IDLE_TIME) {
|
if (idleTimeMs / 1000 > UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,7 +1,9 @@
|
||||||
//^https?://(?:www\.)?facebook\.com/.*$
|
// {"url_regex":"^https?://(?:www\\.)?facebook\\.com/.*$", "request_idle_timeout_sec":30}
|
||||||
|
//
|
||||||
// vim:set sw=8 et:
|
// vim:set sw=8 et:
|
||||||
|
//
|
||||||
|
|
||||||
var aboveBelowOrOnScreen = function(e) {
|
var umbraAboveBelowOrOnScreen = function(e) {
|
||||||
var eTop = e.getBoundingClientRect().top;
|
var eTop = e.getBoundingClientRect().top;
|
||||||
if (eTop < window.scrollY) {
|
if (eTop < window.scrollY) {
|
||||||
return -1; // above
|
return -1; // above
|
||||||
|
@ -13,11 +15,11 @@ var aboveBelowOrOnScreen = function(e) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// comments - 'a.UFIPagerLink > span, a.UFIPagerLink, span.UFIReplySocialSentenceLinkText'
|
// comments - 'a.UFIPagerLink > span, a.UFIPagerLink, span.UFIReplySocialSentenceLinkText'
|
||||||
var THINGS_TO_CLICK_SELECTOR = 'a[href^="/browse/likes"], *[rel="theater"]';
|
var UMBRA_THINGS_TO_CLICK_SELECTOR = 'a[href^="/browse/likes"], *[rel="theater"]';
|
||||||
var alreadyClicked = {};
|
var umbraAlreadyClicked = {};
|
||||||
var intervalId;
|
var umbraState = {'idleSince':null};
|
||||||
|
|
||||||
var intervalFunc = function() {
|
var umbraIntervalFunc = function() {
|
||||||
var closeButton = document.querySelector('a[title="Close"]');
|
var closeButton = document.querySelector('a[title="Close"]');
|
||||||
if (closeButton) {
|
if (closeButton) {
|
||||||
console.log("clicking close button " + closeButton.outerHTML);
|
console.log("clicking close button " + closeButton.outerHTML);
|
||||||
|
@ -31,15 +33,15 @@ var intervalFunc = function() {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
var thingsToClick = document.querySelectorAll(THINGS_TO_CLICK_SELECTOR);
|
var thingsToClick = document.querySelectorAll(UMBRA_THINGS_TO_CLICK_SELECTOR);
|
||||||
var clickedSomething = false;
|
var clickedSomething = false;
|
||||||
var somethingLeftBelow = false;
|
var somethingLeftBelow = false;
|
||||||
var missedAbove = 0;
|
var missedAbove = 0;
|
||||||
|
|
||||||
for (var i = 0; i < thingsToClick.length; i++) {
|
for (var i = 0; i < thingsToClick.length; i++) {
|
||||||
var target = thingsToClick[i];
|
var target = thingsToClick[i];
|
||||||
if (!(target in alreadyClicked)) {
|
if (!(target in umbraAlreadyClicked)) {
|
||||||
var where = aboveBelowOrOnScreen(target);
|
var where = umbraAboveBelowOrOnScreen(target);
|
||||||
if (where == 0) { // on screen
|
if (where == 0) { // on screen
|
||||||
// var pos = target.getBoundingClientRect().top;
|
// var pos = target.getBoundingClientRect().top;
|
||||||
// window.scrollTo(0, target.getBoundingClientRect().top - 100);
|
// window.scrollTo(0, target.getBoundingClientRect().top - 100);
|
||||||
|
@ -48,8 +50,9 @@ var intervalFunc = function() {
|
||||||
target.click();
|
target.click();
|
||||||
}
|
}
|
||||||
target.style.border = '1px solid #0a0';
|
target.style.border = '1px solid #0a0';
|
||||||
alreadyClicked[target] = true;
|
umbraAlreadyClicked[target] = true;
|
||||||
clickedSomething = true;
|
clickedSomething = true;
|
||||||
|
umbraState.idleSince = null;
|
||||||
break;
|
break;
|
||||||
} else if (where > 0) {
|
} else if (where > 0) {
|
||||||
somethingLeftBelow = true;
|
somethingLeftBelow = true;
|
||||||
|
@ -67,11 +70,31 @@ var intervalFunc = function() {
|
||||||
if (somethingLeftBelow) {
|
if (somethingLeftBelow) {
|
||||||
console.log("scrolling because everything on this screen has been clicked but there's more below document.body.clientHeight=" + document.body.clientHeight);
|
console.log("scrolling because everything on this screen has been clicked but there's more below document.body.clientHeight=" + document.body.clientHeight);
|
||||||
window.scrollBy(0, 200);
|
window.scrollBy(0, 200);
|
||||||
|
umbraState.idleSince = null;
|
||||||
} else if (window.scrollY + window.innerHeight + 10 < document.body.clientHeight) {
|
} else if (window.scrollY + window.innerHeight + 10 < document.body.clientHeight) {
|
||||||
console.log("scrolling because we're not to the bottom yet document.body.clientHeight=" + document.body.clientHeight);
|
console.log("scrolling because we're not to the bottom yet document.body.clientHeight=" + document.body.clientHeight);
|
||||||
window.scrollBy(0, 200);
|
window.scrollBy(0, 200);
|
||||||
|
umbraState.idleSince = null;
|
||||||
|
} else if (umbraState.idleSince == null) {
|
||||||
|
umbraState.idleSince = Date.now();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
var intervalId = setInterval(intervalFunc, 200);
|
// If we haven't had anything to do (scrolled, clicked, etc) in this amount of
|
||||||
|
// time, then we consider ourselves finished with the page.
|
||||||
|
var UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC = 10;
|
||||||
|
|
||||||
|
// Called from outside of this script.
|
||||||
|
var umbraBehaviorFinished = function() {
|
||||||
|
if (umbraState.idleSince != null) {
|
||||||
|
var idleTimeMs = Date.now() - umbraState.idleSince;
|
||||||
|
if (idleTimeMs / 1000 > UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
var umbraIntervalId = setInterval(umbraIntervalFunc, 200);
|
||||||
|
|
|
@ -1,17 +1,20 @@
|
||||||
//^https?://(?:www\.)?flickr\.com/.*$
|
// {"url_regex":"^https?://(?:www\\.)?flickr\\.com/.*$", "request_idle_timeout_sec":10}
|
||||||
|
//
|
||||||
|
// vim:set sw=8 et:
|
||||||
|
//
|
||||||
|
|
||||||
setInterval(function() { window.scrollBy(0,50); }, 100);
|
setInterval(function() { window.scrollBy(0,50); }, 100);
|
||||||
|
|
||||||
setTimeout(function() {
|
setTimeout(function() {
|
||||||
a = document.evaluate("//a[contains(@class, 'sn-ico-slideshow')]", document, null, XPathResult.UNORDERED_NODE_ITERATOR_TYPE, null );
|
a = document.evaluate("//a[contains(@class, 'sn-ico-slideshow')]", document, null, XPathResult.UNORDERED_NODE_ITERATOR_TYPE, null );
|
||||||
f = a.iterateNext();
|
|
||||||
f.click();},
|
|
||||||
5000);
|
|
||||||
|
|
||||||
setTimeout(function() {
|
|
||||||
a = document.evaluate("//a[contains(@data-track, 'photo-click')]", document, null, XPathResult.UNORDERED_NODE_ITERATOR_TYPE, null );
|
|
||||||
setInterval(function() {
|
|
||||||
f = a.iterateNext();
|
f = a.iterateNext();
|
||||||
f.click();
|
f.click();
|
||||||
}, 5000);
|
}, 5000);
|
||||||
|
|
||||||
|
setTimeout(function() {
|
||||||
|
a = document.evaluate("//a[contains(@data-track, 'photo-click')]", document, null, XPathResult.UNORDERED_NODE_ITERATOR_TYPE, null );
|
||||||
|
setInterval(function() {
|
||||||
|
f = a.iterateNext();
|
||||||
|
f.click();
|
||||||
|
}, 5000);
|
||||||
}, 5000);
|
}, 5000);
|
||||||
|
|
|
@ -1,7 +1,27 @@
|
||||||
//^https?://(?:www\.)?vimeo.com/.*$
|
// {"url_regex":"^https?://(?:www\\.)?vimeo\\.com/.*$", "request_idle_timeout_sec":10}
|
||||||
|
//
|
||||||
|
// vim:set sw=8 et:
|
||||||
|
//
|
||||||
|
|
||||||
var videoElements = document.getElementsByTagName('video');
|
var umbraState = {'idleSince':null};
|
||||||
for (var i = 0; i < videoElements.length; i++) {
|
var umbraVideoElements = document.getElementsByTagName('video');
|
||||||
videoElements[i].play();
|
for (var i = 0; i < umbraVideoElements.length; i++) {
|
||||||
|
umbraVideoElements[i].play();
|
||||||
|
}
|
||||||
|
umbraState.idleSince = Date.now();
|
||||||
|
|
||||||
|
// If we haven't had anything to do (scrolled, clicked, etc) in this amount of
|
||||||
|
// time, then we consider ourselves finished with the page.
|
||||||
|
var UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC = 10;
|
||||||
|
|
||||||
|
// Called from outside of this script.
|
||||||
|
var umbraBehaviorFinished = function() {
|
||||||
|
if (umbraState.idleSince != null) {
|
||||||
|
var idleTimeMs = Date.now() - umbraState.idleSince;
|
||||||
|
if (idleTimeMs / 1000 > UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,16 +1,18 @@
|
||||||
# vim: set sw=4 et:
|
# vim: set sw=4 et:
|
||||||
|
|
||||||
from json import dumps, load
|
import json
|
||||||
from itertools import chain
|
from itertools import chain
|
||||||
import os, re
|
import os
|
||||||
|
import re
|
||||||
import logging
|
import logging
|
||||||
import time
|
import time
|
||||||
|
import sys
|
||||||
|
|
||||||
class Behavior:
|
class Behavior:
|
||||||
logger = logging.getLogger('umbra.behaviors.Behavior')
|
logger = logging.getLogger('umbra.behaviors.Behavior')
|
||||||
|
|
||||||
_behaviors = None
|
_behaviors = None
|
||||||
_default_behavior_script = None
|
_default_behavior = None
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def behaviors():
|
def behaviors():
|
||||||
|
@ -20,21 +22,29 @@ class Behavior:
|
||||||
Behavior._behaviors = []
|
Behavior._behaviors = []
|
||||||
for file_name in behavior_files:
|
for file_name in behavior_files:
|
||||||
Behavior.logger.debug("reading behavior file {}".format(file_name))
|
Behavior.logger.debug("reading behavior file {}".format(file_name))
|
||||||
lines = open(file_name).readlines()
|
script = open(file_name, encoding='utf-8').read()
|
||||||
pattern, script = lines[0][2:].strip(), ''.join(lines[1:])
|
first_line = script[:script.find('\n')]
|
||||||
Behavior._behaviors.append({'url_regex': pattern, 'script': script, 'file': file_name})
|
behavior = json.loads(first_line[2:].strip())
|
||||||
Behavior.logger.info("will run behaviors from {} to urls matching {}".format(file_name, pattern))
|
behavior['script'] = script
|
||||||
|
behavior['file'] = file_name
|
||||||
|
Behavior._behaviors.append(behavior)
|
||||||
|
Behavior.logger.info("will run behaviors from {} on urls matching {}".format(file_name, behavior['url_regex']))
|
||||||
|
|
||||||
return Behavior._behaviors
|
return Behavior._behaviors
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def default_behavior_script():
|
def default_behavior():
|
||||||
if Behavior._default_behavior_script is None:
|
if Behavior._default_behavior is None:
|
||||||
behaviors_directory = os.path.sep.join(__file__.split(os.path.sep)[:-1] + ['behaviors.d'])
|
behaviors_directory = os.path.sep.join(__file__.split(os.path.sep)[:-1] + ['behaviors.d'])
|
||||||
file_name = os.path.join(behaviors_directory, 'default.js')
|
file_name = os.path.join(behaviors_directory, 'default.js')
|
||||||
Behavior.logger.debug("reading default behavior file {}".format(file_name))
|
Behavior.logger.debug("reading default behavior file {}".format(file_name))
|
||||||
Behavior._default_behavior_script = open(file_name).read()
|
script = open(file_name, encoding='utf-8').read()
|
||||||
return Behavior._default_behavior_script
|
first_line = script[:script.find('\n')]
|
||||||
|
behavior = json.loads(first_line[2:].strip())
|
||||||
|
behavior['script'] = script
|
||||||
|
behavior['file'] = file_name
|
||||||
|
Behavior._default_behavior = behavior
|
||||||
|
return Behavior._default_behavior
|
||||||
|
|
||||||
def __init__(self, url, websock, command_id):
|
def __init__(self, url, websock, command_id):
|
||||||
self.url = url
|
self.url = url
|
||||||
|
@ -43,32 +53,37 @@ class Behavior:
|
||||||
|
|
||||||
self.script_finished = False
|
self.script_finished = False
|
||||||
self.waiting_result_msg_ids = []
|
self.waiting_result_msg_ids = []
|
||||||
|
self.active_behavior = None
|
||||||
|
self.last_activity = time.time()
|
||||||
|
|
||||||
def start(self):
|
def start(self):
|
||||||
self.notify_of_activity()
|
|
||||||
|
|
||||||
script_started = False
|
|
||||||
for behavior in Behavior.behaviors():
|
for behavior in Behavior.behaviors():
|
||||||
if re.match(behavior['url_regex'], self.url):
|
if re.match(behavior['url_regex'], self.url):
|
||||||
msg = dumps(dict(method="Runtime.evaluate", params={"expression": behavior['script']}, id=next(self.command_id)))
|
self.active_behavior = behavior
|
||||||
self.logger.debug('sending message to {}: {}'.format(self.websock, msg))
|
|
||||||
self.websock.send(msg)
|
|
||||||
script_started = True
|
|
||||||
break
|
break
|
||||||
|
|
||||||
if not script_started:
|
if self.active_behavior is None:
|
||||||
msg = dumps(dict(method="Runtime.evaluate", params={"expression": Behavior.default_behavior_script()}, id=next(self.command_id)))
|
self.active_behavior = Behavior.default_behavior()
|
||||||
self.logger.debug('sending message to {}: {}'.format(self.websock, msg))
|
|
||||||
self.websock.send(msg)
|
msg = json.dumps(dict(method="Runtime.evaluate", params={"expression": self.active_behavior['script']}, id=next(self.command_id)))
|
||||||
|
self.logger.debug('sending message to {}: {}'.format(self.websock, msg))
|
||||||
|
self.websock.send(msg)
|
||||||
|
|
||||||
|
self.notify_of_activity()
|
||||||
|
|
||||||
def is_finished(self):
|
def is_finished(self):
|
||||||
msg_id = next(self.command_id)
|
msg_id = next(self.command_id)
|
||||||
self.waiting_result_msg_ids.append(msg_id)
|
self.waiting_result_msg_ids.append(msg_id)
|
||||||
msg = dumps(dict(method="Runtime.evaluate", params={"expression": "umbraBehaviorFinished()"}, id=msg_id))
|
msg = json.dumps(dict(method="Runtime.evaluate", params={"expression": "umbraBehaviorFinished()"}, id=msg_id))
|
||||||
self.logger.debug('sending message to {}: {}'.format(self.websock, msg))
|
self.logger.debug('sending message to {}: {}'.format(self.websock, msg))
|
||||||
self.websock.send(msg)
|
self.websock.send(msg)
|
||||||
|
|
||||||
return self.script_finished # XXX and idle_time > behavior_specified_idle_timeout
|
request_idle_timeout_sec = 30
|
||||||
|
if self.active_behavior and 'request_idle_timeout_sec' in self.active_behavior:
|
||||||
|
request_idle_timeout_sec = self.active_behavior['request_idle_timeout_sec']
|
||||||
|
idle_time = time.time() - self.last_activity
|
||||||
|
|
||||||
|
return self.script_finished and idle_time > request_idle_timeout_sec
|
||||||
|
|
||||||
def is_waiting_on_result(self, msg_id):
|
def is_waiting_on_result(self, msg_id):
|
||||||
return msg_id in self.waiting_result_msg_ids
|
return msg_id in self.waiting_result_msg_ids
|
||||||
|
@ -87,4 +102,11 @@ class Behavior:
|
||||||
def notify_of_activity(self):
|
def notify_of_activity(self):
|
||||||
self.last_activity = time.time()
|
self.last_activity = time.time()
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG,
|
||||||
|
format='%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')
|
||||||
|
logger = logging.getLogger('umbra.behaviors')
|
||||||
|
logger.info("custom behaviors: {}".format(Behavior.behaviors()))
|
||||||
|
logger.info("default behavior: {}".format(Behavior.default_behavior()))
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue