Merge branch 'behavior-refactor' into qa

This commit is contained in:
Barbara Miller 2017-10-04 20:23:57 -07:00
commit f77144e2dc
5 changed files with 153 additions and 93 deletions

View file

@ -43,10 +43,20 @@
default_parameters: default_parameters:
actions: actions:
- selector: div.teaser, li.pager__item a - selector: div.teaser, li.pager__item a
- # https://webarchive.jira.com/browse/ARI-5389
url_regex: '^https?://pitchfork\.com/.*$'
behavior_js_template: umbraBehavior.js.j2
default_parameters:
actions:
- selector: div.teaser, li.pager__item a
closeSelector: .pmf-artist-modal__close-btn
- -
url_regex: '^https?://(?:www\.)?huffingtonpost\.com/.*$' url_regex: '^https?://(?:www\.)?huffingtonpost\.com/.*$'
behavior_js_template: huffpostslides.js behavior_js_template: umbraBehavior.js.j2
request_idle_timeout_sec: 10 default_parameters:
actions:
- selector: .slideshow
- selector: .slideshow-overlay__container__left__nav__next
- -
url_regex: '^https?://(?:www\.)?brooklynmuseum\.org/exhibitions/.*$' url_regex: '^https?://(?:www\.)?brooklynmuseum\.org/exhibitions/.*$'
behavior_js_template: simpleclicks.js.j2 behavior_js_template: simpleclicks.js.j2
@ -96,13 +106,6 @@
click_css_selector: button.playButton.medium click_css_selector: button.playButton.medium
click_until_hard_timeout: False click_until_hard_timeout: False
request_idle_timeout_sec: 10 request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/ARI-4690
url_regex: '^https?://(?:www\.)?youtube.com/.*$'
behavior_js_template: simpleclicks.js.j2
default_parameters:
click_css_selector: span.load-more-text
click_until_hard_timeout: False
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/ARI-5453 / ARI-5391 - # https://webarchive.jira.com/browse/ARI-5453 / ARI-5391
url_regex: '^https?://.*\.wixsite.com/.*$' url_regex: '^https?://.*\.wixsite.com/.*$'
behavior_js_template: simpleclicks.js.j2 behavior_js_template: simpleclicks.js.j2
@ -169,13 +172,6 @@
click_css_selector: button#ird3-button-next click_css_selector: button#ird3-button-next
click_until_hard_timeout: True click_until_hard_timeout: True
request_idle_timeout_sec: 10 request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/ARI-5389
url_regex: '^https?://pitchfork\.com/.*$'
behavior_js_template: pitchfork.js
- # https://webarchive.jira.com/browse/ARI-5379
url_regex: '^https?://(?:www\.)?pm\.gc\.ca/.*$'
behavior_js_template: pm-ca.js
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/ARI-4960 - # https://webarchive.jira.com/browse/ARI-4960
url_regex: '^https?://(?:www\.)?fortstjames.ca/community-events-calendar/$' url_regex: '^https?://(?:www\.)?fortstjames.ca/community-events-calendar/$'
behavior_js_template: simpleclicks.js.j2 behavior_js_template: simpleclicks.js.j2
@ -192,5 +188,7 @@
request_idle_timeout_sec: 10 request_idle_timeout_sec: 10
- # default fallback behavior - # default fallback behavior
url_regex: '^.*$' url_regex: '^.*$'
request_idle_timeout_sec: 10 behavior_js_template: umbraBehavior.js.j2
behavior_js_template: default.js default_parameters:
actions:
- selector: button.sc-button-play, button.playButton, div.soundItem

View file

@ -381,7 +381,7 @@ class Browser:
on_request=None, on_response=None, on_screenshot=None, on_request=None, on_response=None, on_screenshot=None,
username=None, password=None, hashtags=None, username=None, password=None, hashtags=None,
skip_extract_outlinks=False, skip_visit_hashtags=False, skip_extract_outlinks=False, skip_visit_hashtags=False,
page_timeout=300, behavior_timeout=900): skip_youtube_dl=False, page_timeout=300, behavior_timeout=900):
''' '''
Browses page in browser. Browses page in browser.

15
brozzler/cli.py Normal file → Executable file
View file

@ -160,6 +160,9 @@ def brozzle_page(argv=None):
arg_parser.add_argument( arg_parser.add_argument(
'--skip-visit-hashtags', dest='skip_visit_hashtags', '--skip-visit-hashtags', dest='skip_visit_hashtags',
action='store_true', help=argparse.SUPPRESS) action='store_true', help=argparse.SUPPRESS)
arg_parser.add_argument(
'--skip-youtube-dl', dest='skip_youtube_dl',
action='store_true', help=argparse.SUPPRESS)
add_common_options(arg_parser, argv) add_common_options(arg_parser, argv)
args = arg_parser.parse_args(args=argv[1:]) args = arg_parser.parse_args(args=argv[1:])
@ -174,7 +177,8 @@ def brozzle_page(argv=None):
page = brozzler.Page(None, {'url': args.url, 'site_id': site.id}) page = brozzler.Page(None, {'url': args.url, 'site_id': site.id})
worker = brozzler.BrozzlerWorker(frontier=None, proxy=args.proxy, worker = brozzler.BrozzlerWorker(frontier=None, proxy=args.proxy,
skip_extract_outlinks=args.skip_extract_outlinks, skip_extract_outlinks=args.skip_extract_outlinks,
skip_visit_hashtags=args.skip_visit_hashtags) skip_visit_hashtags=args.skip_visit_hashtags,
skip_youtube_dl=args.skip_youtube_dl)
def on_screenshot(screenshot_png): def on_screenshot(screenshot_png):
OK_CHARS = (string.ascii_letters + string.digits) OK_CHARS = (string.ascii_letters + string.digits)
@ -190,7 +194,8 @@ def brozzle_page(argv=None):
try: try:
browser.start(proxy=args.proxy) browser.start(proxy=args.proxy)
outlinks = worker.brozzle_page( outlinks = worker.brozzle_page(
browser, site, page, on_screenshot=on_screenshot) browser, site, page, on_screenshot=on_screenshot,
enable_youtube_dl=not args.skip_youtube_dl)
logging.info('outlinks: \n\t%s', '\n\t'.join(sorted(outlinks))) logging.info('outlinks: \n\t%s', '\n\t'.join(sorted(outlinks)))
except brozzler.ReachedLimit as e: except brozzler.ReachedLimit as e:
logging.error('reached limit %s', e) logging.error('reached limit %s', e)
@ -313,6 +318,9 @@ def brozzler_worker(argv=None):
arg_parser.add_argument( arg_parser.add_argument(
'--skip-visit-hashtags', dest='skip_visit_hashtags', '--skip-visit-hashtags', dest='skip_visit_hashtags',
action='store_true', help=argparse.SUPPRESS) action='store_true', help=argparse.SUPPRESS)
arg_parser.add_argument(
'--skip-youtube-dl', dest='skip_youtube_dl',
action='store_true', help=argparse.SUPPRESS)
add_common_options(arg_parser, argv) add_common_options(arg_parser, argv)
args = arg_parser.parse_args(args=argv[1:]) args = arg_parser.parse_args(args=argv[1:])
@ -347,7 +355,8 @@ def brozzler_worker(argv=None):
chrome_exe=args.chrome_exe, proxy=args.proxy, chrome_exe=args.chrome_exe, proxy=args.proxy,
warcprox_auto=args.warcprox_auto, warcprox_auto=args.warcprox_auto,
skip_extract_outlinks=args.skip_extract_outlinks, skip_extract_outlinks=args.skip_extract_outlinks,
skip_visit_hashtags=args.skip_visit_hashtags) skip_visit_hashtags=args.skip_visit_hashtags,
skip_youtube_dl=args.skip_youtube_dl)
signal.signal(signal.SIGQUIT, dump_state) signal.signal(signal.SIGQUIT, dump_state)
signal.signal(signal.SIGTERM, lambda s,f: worker.stop()) signal.signal(signal.SIGTERM, lambda s,f: worker.stop())

View file

@ -1,5 +1,5 @@
/* /*
* brozzler/js-templates/umbrabehavior.js.j2 - a library for umbra/brozzler behaviors * brozzler/js-templates/umbrabehavior.js.j2 - a generalized umbra/brozzler behavior
* *
* Copyright (C) 2017 Internet Archive * Copyright (C) 2017 Internet Archive
* *
@ -25,94 +25,144 @@ var umbraBehavior = {
state : null, state : null,
actions : {{actions|json}}, actions : {{actions|json}},
k : 0,
intervalFunc: function() { intervalFunc: function() {
if (!this.state) { if (!this.state) {
this.state = this.actions.length === 1 ? "simple" : "fancy"; this.state = this.actions.length === 1 ? "simple" : "fancy";
} else if (this.actions.length === k + 1) {
// last action always uses simple block
this.state = "simple";
} }
for (var k = 0; k < this.actions.length; k++) {
var selector = this.actions[k].selector; var k = this.k;
var action = this.actions[k].do ? this.actions[k].do : 'click'; var selector = this.actions[k].selector;
var limit = this.actions[k].limit ? this.actions[k].limit : 0; var action = this.actions[k].do ? this.actions[k].do : 'click';
if (limit && this.actions[k].alreadyDone && this.actions[k].alreadyDone.length >= limit) { var closeSelector = this.actions[k].closeSelector ? this.actions[k].closeSelector : null;
continue;
} // need to figure out more about how to end more complex actions...
if (limit && !(this.actions[k].alreadyDone)) { // var limit = this.actions[k].limit ? this.actions[k].limit : 0;
this.actions[k].alreadyDone = []; // if (limit && this.actions[k].alreadyDone && this.actions[k].alreadyDone.length >= limit) {
// continue;
// }
// if (limit && !(this.actions[k].alreadyDone)) {
// this.actions[k].alreadyDone = [];
// }
if (this.state === "fancy") {
var moreButton = document.querySelectorAll(selector);
if (moreButton.length > 0) {
console.log("clicking more button");
this.doTarget(moreButton[0],action);
this.k++; // use next action at next run of interval function
return;
} }
if (this.state === "simple") { if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) {
var didSomething = false; window.scrollBy(0, 200);
var somethingLeftBelow = false; this.idleSince = null;
var somethingLeftAbove = false; return;
}
var iframes = document.querySelectorAll("iframe"); if (this.idleSince === null) {
var documents = Array(iframes.length + 1); console.log("nothing to do at the moment, might be waiting for something to load, setting this.idleSince=Date.now()");
documents[0] = document; this.idleSince = Date.now();
return;
} else {
if ((Date.now() - this.idleSince) > 9000) {
console.log("finished loading-thumbs, it appears we have reached the bottom");
this.state = "clicking-first-thumb";
this.idleSince = null;
}
return;
}
}
iframesLength = iframes.length; if (this.state === "simple") {
for (var i = 0; i < iframesLength; i++) { var didSomething = false;
documents[i+1] = iframes[i].contentWindow.document; var somethingLeftBelow = false;
var somethingLeftAbove = false;
var iframes = document.querySelectorAll("iframe");
var documents = Array(iframes.length + 1);
documents[0] = document;
iframesLength = iframes.length;
for (var i = 0; i < iframesLength; i++) {
documents[i+1] = iframes[i].contentWindow.document;
}
documentsLength = documents.length;
for (var j = 0; j < documentsLength; j++) {
if (closeSelector) {
var closeTargets = documents[j].querySelectorAll(closeSelector);
if (closeTargets != []) {
for ( var i = 0; i < closeTargets.length; i++) {
this.doTarget(closeTargets[i], 'click');
didSomething = true;
break; // break from closeTargets loop
}
}
} }
documentsLength = documents.length; var doTargets = documents[j].querySelectorAll(selector);
for (var j = 0; j < documentsLength; j++) { if (doTargets == []) {
continue;
}
var doTargets = documents[j].querySelectorAll(selector); doTargetsLength = doTargets.length;
if (doTargets == []) { for ( var i = 0; i < doTargetsLength; i++) {
// if using limits...
// if (limit && this.actions[k].alreadyDone && this.actions[k].alreadyDone.length >= limit) {
// break;
// }
if (this.alreadyDone.indexOf(doTargets[i]) > -1) {
continue; continue;
} }
if (!this.isVisible(doTargets[i])) {
doTargetsLength = doTargets.length; continue;
for ( var i = 0; i < doTargetsLength; i++) {
if (limit && this.actions[k].alreadyDone && this.actions[k].alreadyDone.length >= limit) {
break;
}
if (this.alreadyDone.indexOf(doTargets[i]) > -1) {
continue;
}
if (!this.isVisible(doTargets[i])) {
continue;
}
var where = this.aboveBelowOrOnScreen(doTargets[i]);
if (where == 0) {
this.doTarget(doTargets[i], action);
if (this.actions[k].alreadyDone) {
this.actions[k].alreadyDone.push(doTargets[i]);
}
didSomething = true;
break; // break from doTargets loop, but not from documents loop
} else if (where > 0) {
somethingLeftBelow = true;
} else if (where < 0) {
somethingLeftAbove = true;
}
} }
} var where = this.aboveBelowOrOnScreen(doTargets[i]);
if (!didSomething) { if (where == 0) {
if (somethingLeftAbove) { this.doTarget(doTargets[i], action);
// console.log("scrolling UP because everything on this screen has been clicked but we missed something above"); // if (this.actions[k].alreadyDone) {
window.scrollBy(0, -500); // this.actions[k].alreadyDone.push(doTargets[i]);
this.idleSince = null; // }
} else if (somethingLeftBelow) { didSomething = true;
// console.log("scrolling because everything on this screen has been clicked but there's more below document.body.clientHeight=" break; // break from doTargets loop, but not from documents loop
// + document.body.clientHeight); } else if (where > 0) {
window.scrollBy(0, 200); somethingLeftBelow = true;
this.idleSince = null; } else if (where < 0) {
} else if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) { somethingLeftAbove = true;
// console.log("scrolling because we're not to the bottom yet document.body.clientHeight="
// + document.body.clientHeight);
window.scrollBy(0, 200);
this.idleSince = null;
} else if (this.idleSince == null) {
this.idleSince = Date.now();
} }
} }
} }
if (!this.idleSince) { if (!didSomething) {
this.idleSince = Date.now(); if (somethingLeftAbove) {
// console.log("scrolling UP because everything on this screen has been clicked but we missed something above");
window.scrollBy(0, -500);
this.idleSince = null;
} else if (somethingLeftBelow) {
// console.log("scrolling because everything on this screen has been clicked but there's more below document.body.clientHeight="
// + document.body.clientHeight);
window.scrollBy(0, 200);
this.idleSince = null;
} else if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) {
// console.log("scrolling because we're not to the bottom yet document.body.clientHeight="
// + document.body.clientHeight);
window.scrollBy(0, 200);
this.idleSince = null;
} else if (this.idleSince == null) {
this.idleSince = Date.now();
}
} }
} }
if (!this.idleSince) {
this.idleSince = Date.now();
}
}, },
aboveBelowOrOnScreen : function(elem) { aboveBelowOrOnScreen : function(elem) {

View file

@ -105,7 +105,7 @@ class BrozzlerWorker:
self, frontier, service_registry=None, max_browsers=1, self, frontier, service_registry=None, max_browsers=1,
chrome_exe="chromium-browser", warcprox_auto=False, proxy=None, chrome_exe="chromium-browser", warcprox_auto=False, proxy=None,
skip_extract_outlinks=False, skip_visit_hashtags=False, skip_extract_outlinks=False, skip_visit_hashtags=False,
page_timeout=300, behavior_timeout=900): skip_youtube_dl=False, page_timeout=300, behavior_timeout=900):
self._frontier = frontier self._frontier = frontier
self._service_registry = service_registry self._service_registry = service_registry
self._max_browsers = max_browsers self._max_browsers = max_browsers
@ -116,6 +116,7 @@ class BrozzlerWorker:
self._proxy_is_warcprox = None self._proxy_is_warcprox = None
self._skip_extract_outlinks = skip_extract_outlinks self._skip_extract_outlinks = skip_extract_outlinks
self._skip_visit_hashtags = skip_visit_hashtags self._skip_visit_hashtags = skip_visit_hashtags
self._skip_youtube_dl = skip_youtube_dl
self._page_timeout = page_timeout self._page_timeout = page_timeout
self._behavior_timeout = behavior_timeout self._behavior_timeout = behavior_timeout
@ -420,6 +421,7 @@ class BrozzlerWorker:
on_request=on_request, hashtags=page.hashtags, on_request=on_request, hashtags=page.hashtags,
skip_extract_outlinks=self._skip_extract_outlinks, skip_extract_outlinks=self._skip_extract_outlinks,
skip_visit_hashtags=self._skip_visit_hashtags, skip_visit_hashtags=self._skip_visit_hashtags,
skip_youtube_dl=self._skip_youtube_dl,
page_timeout=self._page_timeout, page_timeout=self._page_timeout,
behavior_timeout=self._behavior_timeout) behavior_timeout=self._behavior_timeout)
if final_page_url != page.url: if final_page_url != page.url:
@ -485,7 +487,8 @@ class BrozzlerWorker:
page.blocked_by_robots = True page.blocked_by_robots = True
self._frontier.completed_page(site, page) self._frontier.completed_page(site, page)
else: else:
outlinks = self.brozzle_page(browser, site, page) outlinks = self.brozzle_page(browser, site, page,
enable_youtube_dl=not self._skip_youtube_dl)
self._frontier.completed_page(site, page) self._frontier.completed_page(site, page)
self._frontier.scope_and_schedule_outlinks( self._frontier.scope_and_schedule_outlinks(
site, page, outlinks) site, page, outlinks)