mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-19 23:35:54 -04:00
Change the magic first line of behavior js files to a commented-out JSON blob, which should include the fields 'url_regex' and 'request_idle_timeout_sec'. behavior.is_finished() now incorporates the custom idle timeout into its check. Also rename variables in behavior scripts with an umbra/UMBRA_ prefix to loosely namespace them, and add "finished" logic to the facebook and vimeo behaviors (flickr needs work to support it).
This commit is contained in:
parent
2a9633ad77
commit
a62a07e6b7
@ -1,23 +1,13 @@
|
||||
// {"request_idle_timeout_sec":10}
|
||||
//
|
||||
// vim:set sw=8 et:
|
||||
//
|
||||
// Scrolls to the bottom of the page. That's it at the moment.
|
||||
//
|
||||
|
||||
// STATES = ['NASCENT', 'NEED_SCROLL', 'WAITING', 'FINISHED']
|
||||
|
||||
// var transition = prepareTransition(state);
|
||||
// if (transition.callback) {
|
||||
// newState.callback(state, newState);
|
||||
// }
|
||||
// state = newState;
|
||||
|
||||
// if (state.status === 'NASCENT') {
|
||||
// } else if (state.status == 'NEED_SCROLL') {
|
||||
// } else if (state.status == 'FINISHED') {
|
||||
|
||||
var UMBRA_FINISH_AFTER_IDLE_TIME = 10 * 1000; // ms
|
||||
var umbraState = {'idleSince':null};
|
||||
var umbraFinished = false;
|
||||
var umbraIntervalFunc = function() {
|
||||
// var needToScroll = (window.scrollY + window.innerHeight + 10 < document.body.clientHeight);
|
||||
// var needToScroll = (document.documentElement.scrollTop + document.documentElement.clientHeight < document.documentElement.scrollHeight);
|
||||
var needToScroll = (window.scrollY + window.innerHeight < document.documentElement.scrollHeight);
|
||||
|
||||
// console.log('intervalFunc umbraState.idleSince=' + umbraState.idleSince + ' needToScroll=' + needToScroll + ' window.scrollY=' + window.scrollY + ' window.innerHeight=' + window.innerHeight + ' document.documentElement.scrollHeight=' + document.documentElement.scrollHeight);
|
||||
@ -29,10 +19,15 @@ var umbraIntervalFunc = function() {
|
||||
}
|
||||
}
|
||||
|
||||
// If we haven't had anything to do (scrolled, clicked, etc) in this amount of
|
||||
// time, then we consider ourselves finished with the page.
|
||||
var UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC = 10;
|
||||
|
||||
// Called from outside of this script.
|
||||
var umbraBehaviorFinished = function() {
|
||||
if (umbraState.idleSince != null) {
|
||||
var idleTime = Date.now() - umbraState.idleSince;
|
||||
if (idleTime > UMBRA_FINISH_AFTER_IDLE_TIME) {
|
||||
var idleTimeMs = Date.now() - umbraState.idleSince;
|
||||
if (idleTimeMs / 1000 > UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
@ -1,7 +1,9 @@
|
||||
//^https?://(?:www\.)?facebook\.com/.*$
|
||||
// {"url_regex":"^https?://(?:www\\.)?facebook\\.com/.*$", "request_idle_timeout_sec":30}
|
||||
//
|
||||
// vim:set sw=8 et:
|
||||
//
|
||||
|
||||
var aboveBelowOrOnScreen = function(e) {
|
||||
var umbraAboveBelowOrOnScreen = function(e) {
|
||||
var eTop = e.getBoundingClientRect().top;
|
||||
if (eTop < window.scrollY) {
|
||||
return -1; // above
|
||||
@ -13,11 +15,11 @@ var aboveBelowOrOnScreen = function(e) {
|
||||
}
|
||||
|
||||
// comments - 'a.UFIPagerLink > span, a.UFIPagerLink, span.UFIReplySocialSentenceLinkText'
|
||||
var THINGS_TO_CLICK_SELECTOR = 'a[href^="/browse/likes"], *[rel="theater"]';
|
||||
var alreadyClicked = {};
|
||||
var intervalId;
|
||||
var UMBRA_THINGS_TO_CLICK_SELECTOR = 'a[href^="/browse/likes"], *[rel="theater"]';
|
||||
var umbraAlreadyClicked = {};
|
||||
var umbraState = {'idleSince':null};
|
||||
|
||||
var intervalFunc = function() {
|
||||
var umbraIntervalFunc = function() {
|
||||
var closeButton = document.querySelector('a[title="Close"]');
|
||||
if (closeButton) {
|
||||
console.log("clicking close button " + closeButton.outerHTML);
|
||||
@ -31,15 +33,15 @@ var intervalFunc = function() {
|
||||
return;
|
||||
}
|
||||
|
||||
var thingsToClick = document.querySelectorAll(THINGS_TO_CLICK_SELECTOR);
|
||||
var thingsToClick = document.querySelectorAll(UMBRA_THINGS_TO_CLICK_SELECTOR);
|
||||
var clickedSomething = false;
|
||||
var somethingLeftBelow = false;
|
||||
var missedAbove = 0;
|
||||
|
||||
for (var i = 0; i < thingsToClick.length; i++) {
|
||||
var target = thingsToClick[i];
|
||||
if (!(target in alreadyClicked)) {
|
||||
var where = aboveBelowOrOnScreen(target);
|
||||
if (!(target in umbraAlreadyClicked)) {
|
||||
var where = umbraAboveBelowOrOnScreen(target);
|
||||
if (where == 0) { // on screen
|
||||
// var pos = target.getBoundingClientRect().top;
|
||||
// window.scrollTo(0, target.getBoundingClientRect().top - 100);
|
||||
@ -48,8 +50,9 @@ var intervalFunc = function() {
|
||||
target.click();
|
||||
}
|
||||
target.style.border = '1px solid #0a0';
|
||||
alreadyClicked[target] = true;
|
||||
umbraAlreadyClicked[target] = true;
|
||||
clickedSomething = true;
|
||||
umbraState.idleSince = null;
|
||||
break;
|
||||
} else if (where > 0) {
|
||||
somethingLeftBelow = true;
|
||||
@ -67,11 +70,31 @@ var intervalFunc = function() {
|
||||
if (somethingLeftBelow) {
|
||||
console.log("scrolling because everything on this screen has been clicked but there's more below document.body.clientHeight=" + document.body.clientHeight);
|
||||
window.scrollBy(0, 200);
|
||||
umbraState.idleSince = null;
|
||||
} else if (window.scrollY + window.innerHeight + 10 < document.body.clientHeight) {
|
||||
console.log("scrolling because we're not to the bottom yet document.body.clientHeight=" + document.body.clientHeight);
|
||||
window.scrollBy(0, 200);
|
||||
}
|
||||
umbraState.idleSince = null;
|
||||
} else if (umbraState.idleSince == null) {
|
||||
umbraState.idleSince = Date.now();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
var intervalId = setInterval(intervalFunc, 200);
|
||||
// If we haven't had anything to do (scrolled, clicked, etc) in this amount of
|
||||
// time, then we consider ourselves finished with the page.
|
||||
var UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC = 10;
|
||||
|
||||
// Called from outside of this script.
|
||||
var umbraBehaviorFinished = function() {
|
||||
if (umbraState.idleSince != null) {
|
||||
var idleTimeMs = Date.now() - umbraState.idleSince;
|
||||
if (idleTimeMs / 1000 > UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
var umbraIntervalId = setInterval(umbraIntervalFunc, 200);
|
||||
|
@ -1,17 +1,20 @@
|
||||
//^https?://(?:www\.)?flickr\.com/.*$
|
||||
// {"url_regex":"^https?://(?:www\\.)?flickr\\.com/.*$", "request_idle_timeout_sec":10}
|
||||
//
|
||||
// vim:set sw=8 et:
|
||||
//
|
||||
|
||||
setInterval(function() { window.scrollBy(0,50); }, 100);
|
||||
|
||||
setTimeout(function() {
|
||||
a = document.evaluate("//a[contains(@class, 'sn-ico-slideshow')]", document, null, XPathResult.UNORDERED_NODE_ITERATOR_TYPE, null );
|
||||
f = a.iterateNext();
|
||||
f.click();},
|
||||
5000);
|
||||
|
||||
setTimeout(function() {
|
||||
a = document.evaluate("//a[contains(@data-track, 'photo-click')]", document, null, XPathResult.UNORDERED_NODE_ITERATOR_TYPE, null );
|
||||
setInterval(function() {
|
||||
a = document.evaluate("//a[contains(@class, 'sn-ico-slideshow')]", document, null, XPathResult.UNORDERED_NODE_ITERATOR_TYPE, null );
|
||||
f = a.iterateNext();
|
||||
f.click();
|
||||
}, 5000);
|
||||
}, 5000);
|
||||
|
||||
setTimeout(function() {
|
||||
a = document.evaluate("//a[contains(@data-track, 'photo-click')]", document, null, XPathResult.UNORDERED_NODE_ITERATOR_TYPE, null );
|
||||
setInterval(function() {
|
||||
f = a.iterateNext();
|
||||
f.click();
|
||||
}, 5000);
|
||||
}, 5000);
|
||||
|
@ -1,7 +1,27 @@
|
||||
//^https?://(?:www\.)?vimeo.com/.*$
|
||||
// {"url_regex":"^https?://(?:www\\.)?vimeo\\.com/.*$", "request_idle_timeout_sec":10}
|
||||
//
|
||||
// vim:set sw=8 et:
|
||||
//
|
||||
|
||||
var videoElements = document.getElementsByTagName('video');
|
||||
for (var i = 0; i < videoElements.length; i++) {
|
||||
videoElements[i].play();
|
||||
var umbraState = {'idleSince':null};
|
||||
var umbraVideoElements = document.getElementsByTagName('video');
|
||||
for (var i = 0; i < umbraVideoElements.length; i++) {
|
||||
umbraVideoElements[i].play();
|
||||
}
|
||||
umbraState.idleSince = Date.now();
|
||||
|
||||
// If we haven't had anything to do (scrolled, clicked, etc) in this amount of
|
||||
// time, then we consider ourselves finished with the page.
|
||||
var UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC = 10;
|
||||
|
||||
// Called from outside of this script.
|
||||
var umbraBehaviorFinished = function() {
|
||||
if (umbraState.idleSince != null) {
|
||||
var idleTimeMs = Date.now() - umbraState.idleSince;
|
||||
if (idleTimeMs / 1000 > UMBRA_USER_ACTION_IDLE_TIMEOUT_SEC) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -1,16 +1,18 @@
|
||||
# vim: set sw=4 et:
|
||||
|
||||
from json import dumps, load
|
||||
import json
|
||||
from itertools import chain
|
||||
import os, re
|
||||
import os
|
||||
import re
|
||||
import logging
|
||||
import time
|
||||
import sys
|
||||
|
||||
class Behavior:
|
||||
logger = logging.getLogger('umbra.behaviors.Behavior')
|
||||
|
||||
_behaviors = None
|
||||
_default_behavior_script = None
|
||||
_default_behavior = None
|
||||
|
||||
@staticmethod
|
||||
def behaviors():
|
||||
@ -20,21 +22,29 @@ class Behavior:
|
||||
Behavior._behaviors = []
|
||||
for file_name in behavior_files:
|
||||
Behavior.logger.debug("reading behavior file {}".format(file_name))
|
||||
lines = open(file_name).readlines()
|
||||
pattern, script = lines[0][2:].strip(), ''.join(lines[1:])
|
||||
Behavior._behaviors.append({'url_regex': pattern, 'script': script, 'file': file_name})
|
||||
Behavior.logger.info("will run behaviors from {} to urls matching {}".format(file_name, pattern))
|
||||
script = open(file_name, encoding='utf-8').read()
|
||||
first_line = script[:script.find('\n')]
|
||||
behavior = json.loads(first_line[2:].strip())
|
||||
behavior['script'] = script
|
||||
behavior['file'] = file_name
|
||||
Behavior._behaviors.append(behavior)
|
||||
Behavior.logger.info("will run behaviors from {} on urls matching {}".format(file_name, behavior['url_regex']))
|
||||
|
||||
return Behavior._behaviors
|
||||
|
||||
@staticmethod
|
||||
def default_behavior_script():
|
||||
if Behavior._default_behavior_script is None:
|
||||
def default_behavior():
|
||||
if Behavior._default_behavior is None:
|
||||
behaviors_directory = os.path.sep.join(__file__.split(os.path.sep)[:-1] + ['behaviors.d'])
|
||||
file_name = os.path.join(behaviors_directory, 'default.js')
|
||||
Behavior.logger.debug("reading default behavior file {}".format(file_name))
|
||||
Behavior._default_behavior_script = open(file_name).read()
|
||||
return Behavior._default_behavior_script
|
||||
script = open(file_name, encoding='utf-8').read()
|
||||
first_line = script[:script.find('\n')]
|
||||
behavior = json.loads(first_line[2:].strip())
|
||||
behavior['script'] = script
|
||||
behavior['file'] = file_name
|
||||
Behavior._default_behavior = behavior
|
||||
return Behavior._default_behavior
|
||||
|
||||
def __init__(self, url, websock, command_id):
|
||||
self.url = url
|
||||
@ -43,32 +53,37 @@ class Behavior:
|
||||
|
||||
self.script_finished = False
|
||||
self.waiting_result_msg_ids = []
|
||||
self.active_behavior = None
|
||||
self.last_activity = time.time()
|
||||
|
||||
def start(self):
|
||||
self.notify_of_activity()
|
||||
|
||||
script_started = False
|
||||
for behavior in Behavior.behaviors():
|
||||
if re.match(behavior['url_regex'], self.url):
|
||||
msg = dumps(dict(method="Runtime.evaluate", params={"expression": behavior['script']}, id=next(self.command_id)))
|
||||
self.logger.debug('sending message to {}: {}'.format(self.websock, msg))
|
||||
self.websock.send(msg)
|
||||
script_started = True
|
||||
self.active_behavior = behavior
|
||||
break
|
||||
|
||||
if not script_started:
|
||||
msg = dumps(dict(method="Runtime.evaluate", params={"expression": Behavior.default_behavior_script()}, id=next(self.command_id)))
|
||||
self.logger.debug('sending message to {}: {}'.format(self.websock, msg))
|
||||
self.websock.send(msg)
|
||||
if self.active_behavior is None:
|
||||
self.active_behavior = Behavior.default_behavior()
|
||||
|
||||
msg = json.dumps(dict(method="Runtime.evaluate", params={"expression": self.active_behavior['script']}, id=next(self.command_id)))
|
||||
self.logger.debug('sending message to {}: {}'.format(self.websock, msg))
|
||||
self.websock.send(msg)
|
||||
|
||||
self.notify_of_activity()
|
||||
|
||||
def is_finished(self):
|
||||
msg_id = next(self.command_id)
|
||||
self.waiting_result_msg_ids.append(msg_id)
|
||||
msg = dumps(dict(method="Runtime.evaluate", params={"expression": "umbraBehaviorFinished()"}, id=msg_id))
|
||||
msg = json.dumps(dict(method="Runtime.evaluate", params={"expression": "umbraBehaviorFinished()"}, id=msg_id))
|
||||
self.logger.debug('sending message to {}: {}'.format(self.websock, msg))
|
||||
self.websock.send(msg)
|
||||
|
||||
return self.script_finished # XXX and idle_time > behavior_specified_idle_timeout
|
||||
request_idle_timeout_sec = 30
|
||||
if self.active_behavior and 'request_idle_timeout_sec' in self.active_behavior:
|
||||
request_idle_timeout_sec = self.active_behavior['request_idle_timeout_sec']
|
||||
idle_time = time.time() - self.last_activity
|
||||
|
||||
return self.script_finished and idle_time > request_idle_timeout_sec
|
||||
|
||||
def is_waiting_on_result(self, msg_id):
|
||||
return msg_id in self.waiting_result_msg_ids
|
||||
@ -87,4 +102,11 @@ class Behavior:
|
||||
def notify_of_activity(self):
|
||||
self.last_activity = time.time()
|
||||
|
||||
if __name__ == "__main__":
|
||||
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG,
|
||||
format='%(asctime)s %(process)d %(levelname)s %(threadName)s %(name)s.%(funcName)s(%(filename)s:%(lineno)d) %(message)s')
|
||||
logger = logging.getLogger('umbra.behaviors')
|
||||
logger.info("custom behaviors: {}".format(Behavior.behaviors()))
|
||||
logger.info("default behavior: {}".format(Behavior.default_behavior()))
|
||||
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user