mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-09-21 05:04:40 -04:00
Merge branch 'behavior-refactor' into qa
This commit is contained in:
commit
f77144e2dc
5 changed files with 153 additions and 93 deletions
|
@ -43,10 +43,20 @@
|
||||||
default_parameters:
|
default_parameters:
|
||||||
actions:
|
actions:
|
||||||
- selector: div.teaser, li.pager__item a
|
- selector: div.teaser, li.pager__item a
|
||||||
|
- # https://webarchive.jira.com/browse/ARI-5389
|
||||||
|
url_regex: '^https?://pitchfork\.com/.*$'
|
||||||
|
behavior_js_template: umbraBehavior.js.j2
|
||||||
|
default_parameters:
|
||||||
|
actions:
|
||||||
|
- selector: div.teaser, li.pager__item a
|
||||||
|
closeSelector: .pmf-artist-modal__close-btn
|
||||||
-
|
-
|
||||||
url_regex: '^https?://(?:www\.)?huffingtonpost\.com/.*$'
|
url_regex: '^https?://(?:www\.)?huffingtonpost\.com/.*$'
|
||||||
behavior_js_template: huffpostslides.js
|
behavior_js_template: umbraBehavior.js.j2
|
||||||
request_idle_timeout_sec: 10
|
default_parameters:
|
||||||
|
actions:
|
||||||
|
- selector: .slideshow
|
||||||
|
- selector: .slideshow-overlay__container__left__nav__next
|
||||||
-
|
-
|
||||||
url_regex: '^https?://(?:www\.)?brooklynmuseum\.org/exhibitions/.*$'
|
url_regex: '^https?://(?:www\.)?brooklynmuseum\.org/exhibitions/.*$'
|
||||||
behavior_js_template: simpleclicks.js.j2
|
behavior_js_template: simpleclicks.js.j2
|
||||||
|
@ -96,13 +106,6 @@
|
||||||
click_css_selector: button.playButton.medium
|
click_css_selector: button.playButton.medium
|
||||||
click_until_hard_timeout: False
|
click_until_hard_timeout: False
|
||||||
request_idle_timeout_sec: 10
|
request_idle_timeout_sec: 10
|
||||||
- # https://webarchive.jira.com/browse/ARI-4690
|
|
||||||
url_regex: '^https?://(?:www\.)?youtube.com/.*$'
|
|
||||||
behavior_js_template: simpleclicks.js.j2
|
|
||||||
default_parameters:
|
|
||||||
click_css_selector: span.load-more-text
|
|
||||||
click_until_hard_timeout: False
|
|
||||||
request_idle_timeout_sec: 10
|
|
||||||
- # https://webarchive.jira.com/browse/ARI-5453 / ARI-5391
|
- # https://webarchive.jira.com/browse/ARI-5453 / ARI-5391
|
||||||
url_regex: '^https?://.*\.wixsite.com/.*$'
|
url_regex: '^https?://.*\.wixsite.com/.*$'
|
||||||
behavior_js_template: simpleclicks.js.j2
|
behavior_js_template: simpleclicks.js.j2
|
||||||
|
@ -169,13 +172,6 @@
|
||||||
click_css_selector: button#ird3-button-next
|
click_css_selector: button#ird3-button-next
|
||||||
click_until_hard_timeout: True
|
click_until_hard_timeout: True
|
||||||
request_idle_timeout_sec: 10
|
request_idle_timeout_sec: 10
|
||||||
- # https://webarchive.jira.com/browse/ARI-5389
|
|
||||||
url_regex: '^https?://pitchfork\.com/.*$'
|
|
||||||
behavior_js_template: pitchfork.js
|
|
||||||
- # https://webarchive.jira.com/browse/ARI-5379
|
|
||||||
url_regex: '^https?://(?:www\.)?pm\.gc\.ca/.*$'
|
|
||||||
behavior_js_template: pm-ca.js
|
|
||||||
request_idle_timeout_sec: 10
|
|
||||||
- # https://webarchive.jira.com/browse/ARI-4960
|
- # https://webarchive.jira.com/browse/ARI-4960
|
||||||
url_regex: '^https?://(?:www\.)?fortstjames.ca/community-events-calendar/$'
|
url_regex: '^https?://(?:www\.)?fortstjames.ca/community-events-calendar/$'
|
||||||
behavior_js_template: simpleclicks.js.j2
|
behavior_js_template: simpleclicks.js.j2
|
||||||
|
@ -192,5 +188,7 @@
|
||||||
request_idle_timeout_sec: 10
|
request_idle_timeout_sec: 10
|
||||||
- # default fallback behavior
|
- # default fallback behavior
|
||||||
url_regex: '^.*$'
|
url_regex: '^.*$'
|
||||||
request_idle_timeout_sec: 10
|
behavior_js_template: umbraBehavior.js.j2
|
||||||
behavior_js_template: default.js
|
default_parameters:
|
||||||
|
actions:
|
||||||
|
- selector: button.sc-button-play, button.playButton, div.soundItem
|
||||||
|
|
|
@ -381,7 +381,7 @@ class Browser:
|
||||||
on_request=None, on_response=None, on_screenshot=None,
|
on_request=None, on_response=None, on_screenshot=None,
|
||||||
username=None, password=None, hashtags=None,
|
username=None, password=None, hashtags=None,
|
||||||
skip_extract_outlinks=False, skip_visit_hashtags=False,
|
skip_extract_outlinks=False, skip_visit_hashtags=False,
|
||||||
page_timeout=300, behavior_timeout=900):
|
skip_youtube_dl=False, page_timeout=300, behavior_timeout=900):
|
||||||
'''
|
'''
|
||||||
Browses page in browser.
|
Browses page in browser.
|
||||||
|
|
||||||
|
|
15
brozzler/cli.py
Normal file → Executable file
15
brozzler/cli.py
Normal file → Executable file
|
@ -160,6 +160,9 @@ def brozzle_page(argv=None):
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'--skip-visit-hashtags', dest='skip_visit_hashtags',
|
'--skip-visit-hashtags', dest='skip_visit_hashtags',
|
||||||
action='store_true', help=argparse.SUPPRESS)
|
action='store_true', help=argparse.SUPPRESS)
|
||||||
|
arg_parser.add_argument(
|
||||||
|
'--skip-youtube-dl', dest='skip_youtube_dl',
|
||||||
|
action='store_true', help=argparse.SUPPRESS)
|
||||||
add_common_options(arg_parser, argv)
|
add_common_options(arg_parser, argv)
|
||||||
|
|
||||||
args = arg_parser.parse_args(args=argv[1:])
|
args = arg_parser.parse_args(args=argv[1:])
|
||||||
|
@ -174,7 +177,8 @@ def brozzle_page(argv=None):
|
||||||
page = brozzler.Page(None, {'url': args.url, 'site_id': site.id})
|
page = brozzler.Page(None, {'url': args.url, 'site_id': site.id})
|
||||||
worker = brozzler.BrozzlerWorker(frontier=None, proxy=args.proxy,
|
worker = brozzler.BrozzlerWorker(frontier=None, proxy=args.proxy,
|
||||||
skip_extract_outlinks=args.skip_extract_outlinks,
|
skip_extract_outlinks=args.skip_extract_outlinks,
|
||||||
skip_visit_hashtags=args.skip_visit_hashtags)
|
skip_visit_hashtags=args.skip_visit_hashtags,
|
||||||
|
skip_youtube_dl=args.skip_youtube_dl)
|
||||||
|
|
||||||
def on_screenshot(screenshot_png):
|
def on_screenshot(screenshot_png):
|
||||||
OK_CHARS = (string.ascii_letters + string.digits)
|
OK_CHARS = (string.ascii_letters + string.digits)
|
||||||
|
@ -190,7 +194,8 @@ def brozzle_page(argv=None):
|
||||||
try:
|
try:
|
||||||
browser.start(proxy=args.proxy)
|
browser.start(proxy=args.proxy)
|
||||||
outlinks = worker.brozzle_page(
|
outlinks = worker.brozzle_page(
|
||||||
browser, site, page, on_screenshot=on_screenshot)
|
browser, site, page, on_screenshot=on_screenshot,
|
||||||
|
enable_youtube_dl=not args.skip_youtube_dl)
|
||||||
logging.info('outlinks: \n\t%s', '\n\t'.join(sorted(outlinks)))
|
logging.info('outlinks: \n\t%s', '\n\t'.join(sorted(outlinks)))
|
||||||
except brozzler.ReachedLimit as e:
|
except brozzler.ReachedLimit as e:
|
||||||
logging.error('reached limit %s', e)
|
logging.error('reached limit %s', e)
|
||||||
|
@ -313,6 +318,9 @@ def brozzler_worker(argv=None):
|
||||||
arg_parser.add_argument(
|
arg_parser.add_argument(
|
||||||
'--skip-visit-hashtags', dest='skip_visit_hashtags',
|
'--skip-visit-hashtags', dest='skip_visit_hashtags',
|
||||||
action='store_true', help=argparse.SUPPRESS)
|
action='store_true', help=argparse.SUPPRESS)
|
||||||
|
arg_parser.add_argument(
|
||||||
|
'--skip-youtube-dl', dest='skip_youtube_dl',
|
||||||
|
action='store_true', help=argparse.SUPPRESS)
|
||||||
add_common_options(arg_parser, argv)
|
add_common_options(arg_parser, argv)
|
||||||
|
|
||||||
args = arg_parser.parse_args(args=argv[1:])
|
args = arg_parser.parse_args(args=argv[1:])
|
||||||
|
@ -347,7 +355,8 @@ def brozzler_worker(argv=None):
|
||||||
chrome_exe=args.chrome_exe, proxy=args.proxy,
|
chrome_exe=args.chrome_exe, proxy=args.proxy,
|
||||||
warcprox_auto=args.warcprox_auto,
|
warcprox_auto=args.warcprox_auto,
|
||||||
skip_extract_outlinks=args.skip_extract_outlinks,
|
skip_extract_outlinks=args.skip_extract_outlinks,
|
||||||
skip_visit_hashtags=args.skip_visit_hashtags)
|
skip_visit_hashtags=args.skip_visit_hashtags,
|
||||||
|
skip_youtube_dl=args.skip_youtube_dl)
|
||||||
|
|
||||||
signal.signal(signal.SIGQUIT, dump_state)
|
signal.signal(signal.SIGQUIT, dump_state)
|
||||||
signal.signal(signal.SIGTERM, lambda s,f: worker.stop())
|
signal.signal(signal.SIGTERM, lambda s,f: worker.stop())
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
* brozzler/js-templates/umbrabehavior.js.j2 - a library for umbra/brozzler behaviors
|
* brozzler/js-templates/umbrabehavior.js.j2 - a generalized umbra/brozzler behavior
|
||||||
*
|
*
|
||||||
* Copyright (C) 2017 Internet Archive
|
* Copyright (C) 2017 Internet Archive
|
||||||
*
|
*
|
||||||
|
@ -25,94 +25,144 @@ var umbraBehavior = {
|
||||||
state : null,
|
state : null,
|
||||||
|
|
||||||
actions : {{actions|json}},
|
actions : {{actions|json}},
|
||||||
|
k : 0,
|
||||||
|
|
||||||
intervalFunc: function() {
|
intervalFunc: function() {
|
||||||
if (!this.state) {
|
if (!this.state) {
|
||||||
this.state = this.actions.length === 1 ? "simple" : "fancy";
|
this.state = this.actions.length === 1 ? "simple" : "fancy";
|
||||||
|
} else if (this.actions.length === k + 1) {
|
||||||
|
// last action always uses simple block
|
||||||
|
this.state = "simple";
|
||||||
}
|
}
|
||||||
for (var k = 0; k < this.actions.length; k++) {
|
|
||||||
var selector = this.actions[k].selector;
|
var k = this.k;
|
||||||
var action = this.actions[k].do ? this.actions[k].do : 'click';
|
var selector = this.actions[k].selector;
|
||||||
var limit = this.actions[k].limit ? this.actions[k].limit : 0;
|
var action = this.actions[k].do ? this.actions[k].do : 'click';
|
||||||
if (limit && this.actions[k].alreadyDone && this.actions[k].alreadyDone.length >= limit) {
|
var closeSelector = this.actions[k].closeSelector ? this.actions[k].closeSelector : null;
|
||||||
continue;
|
|
||||||
}
|
// need to figure out more about how to end more complex actions...
|
||||||
if (limit && !(this.actions[k].alreadyDone)) {
|
// var limit = this.actions[k].limit ? this.actions[k].limit : 0;
|
||||||
this.actions[k].alreadyDone = [];
|
// if (limit && this.actions[k].alreadyDone && this.actions[k].alreadyDone.length >= limit) {
|
||||||
|
// continue;
|
||||||
|
// }
|
||||||
|
// if (limit && !(this.actions[k].alreadyDone)) {
|
||||||
|
// this.actions[k].alreadyDone = [];
|
||||||
|
// }
|
||||||
|
|
||||||
|
if (this.state === "fancy") {
|
||||||
|
|
||||||
|
var moreButton = document.querySelectorAll(selector);
|
||||||
|
if (moreButton.length > 0) {
|
||||||
|
console.log("clicking more button");
|
||||||
|
this.doTarget(moreButton[0],action);
|
||||||
|
this.k++; // use next action at next run of interval function
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (this.state === "simple") {
|
if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) {
|
||||||
var didSomething = false;
|
window.scrollBy(0, 200);
|
||||||
var somethingLeftBelow = false;
|
this.idleSince = null;
|
||||||
var somethingLeftAbove = false;
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
var iframes = document.querySelectorAll("iframe");
|
if (this.idleSince === null) {
|
||||||
var documents = Array(iframes.length + 1);
|
console.log("nothing to do at the moment, might be waiting for something to load, setting this.idleSince=Date.now()");
|
||||||
documents[0] = document;
|
this.idleSince = Date.now();
|
||||||
|
return;
|
||||||
|
} else {
|
||||||
|
if ((Date.now() - this.idleSince) > 9000) {
|
||||||
|
console.log("finished loading-thumbs, it appears we have reached the bottom");
|
||||||
|
this.state = "clicking-first-thumb";
|
||||||
|
this.idleSince = null;
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
iframesLength = iframes.length;
|
if (this.state === "simple") {
|
||||||
for (var i = 0; i < iframesLength; i++) {
|
var didSomething = false;
|
||||||
documents[i+1] = iframes[i].contentWindow.document;
|
var somethingLeftBelow = false;
|
||||||
|
var somethingLeftAbove = false;
|
||||||
|
|
||||||
|
var iframes = document.querySelectorAll("iframe");
|
||||||
|
var documents = Array(iframes.length + 1);
|
||||||
|
documents[0] = document;
|
||||||
|
|
||||||
|
iframesLength = iframes.length;
|
||||||
|
for (var i = 0; i < iframesLength; i++) {
|
||||||
|
documents[i+1] = iframes[i].contentWindow.document;
|
||||||
|
}
|
||||||
|
|
||||||
|
documentsLength = documents.length;
|
||||||
|
for (var j = 0; j < documentsLength; j++) {
|
||||||
|
|
||||||
|
if (closeSelector) {
|
||||||
|
var closeTargets = documents[j].querySelectorAll(closeSelector);
|
||||||
|
if (closeTargets != []) {
|
||||||
|
for ( var i = 0; i < closeTargets.length; i++) {
|
||||||
|
this.doTarget(closeTargets[i], 'click');
|
||||||
|
didSomething = true;
|
||||||
|
break; // break from closeTargets loop
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
documentsLength = documents.length;
|
var doTargets = documents[j].querySelectorAll(selector);
|
||||||
for (var j = 0; j < documentsLength; j++) {
|
if (doTargets == []) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
var doTargets = documents[j].querySelectorAll(selector);
|
doTargetsLength = doTargets.length;
|
||||||
if (doTargets == []) {
|
for ( var i = 0; i < doTargetsLength; i++) {
|
||||||
|
// if using limits...
|
||||||
|
// if (limit && this.actions[k].alreadyDone && this.actions[k].alreadyDone.length >= limit) {
|
||||||
|
// break;
|
||||||
|
// }
|
||||||
|
|
||||||
|
if (this.alreadyDone.indexOf(doTargets[i]) > -1) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
if (!this.isVisible(doTargets[i])) {
|
||||||
doTargetsLength = doTargets.length;
|
continue;
|
||||||
for ( var i = 0; i < doTargetsLength; i++) {
|
|
||||||
if (limit && this.actions[k].alreadyDone && this.actions[k].alreadyDone.length >= limit) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if (this.alreadyDone.indexOf(doTargets[i]) > -1) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if (!this.isVisible(doTargets[i])) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
var where = this.aboveBelowOrOnScreen(doTargets[i]);
|
|
||||||
if (where == 0) {
|
|
||||||
this.doTarget(doTargets[i], action);
|
|
||||||
if (this.actions[k].alreadyDone) {
|
|
||||||
this.actions[k].alreadyDone.push(doTargets[i]);
|
|
||||||
}
|
|
||||||
didSomething = true;
|
|
||||||
break; // break from doTargets loop, but not from documents loop
|
|
||||||
} else if (where > 0) {
|
|
||||||
somethingLeftBelow = true;
|
|
||||||
} else if (where < 0) {
|
|
||||||
somethingLeftAbove = true;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
var where = this.aboveBelowOrOnScreen(doTargets[i]);
|
||||||
if (!didSomething) {
|
if (where == 0) {
|
||||||
if (somethingLeftAbove) {
|
this.doTarget(doTargets[i], action);
|
||||||
// console.log("scrolling UP because everything on this screen has been clicked but we missed something above");
|
// if (this.actions[k].alreadyDone) {
|
||||||
window.scrollBy(0, -500);
|
// this.actions[k].alreadyDone.push(doTargets[i]);
|
||||||
this.idleSince = null;
|
// }
|
||||||
} else if (somethingLeftBelow) {
|
didSomething = true;
|
||||||
// console.log("scrolling because everything on this screen has been clicked but there's more below document.body.clientHeight="
|
break; // break from doTargets loop, but not from documents loop
|
||||||
// + document.body.clientHeight);
|
} else if (where > 0) {
|
||||||
window.scrollBy(0, 200);
|
somethingLeftBelow = true;
|
||||||
this.idleSince = null;
|
} else if (where < 0) {
|
||||||
} else if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) {
|
somethingLeftAbove = true;
|
||||||
// console.log("scrolling because we're not to the bottom yet document.body.clientHeight="
|
|
||||||
// + document.body.clientHeight);
|
|
||||||
window.scrollBy(0, 200);
|
|
||||||
this.idleSince = null;
|
|
||||||
} else if (this.idleSince == null) {
|
|
||||||
this.idleSince = Date.now();
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!this.idleSince) {
|
if (!didSomething) {
|
||||||
this.idleSince = Date.now();
|
if (somethingLeftAbove) {
|
||||||
|
// console.log("scrolling UP because everything on this screen has been clicked but we missed something above");
|
||||||
|
window.scrollBy(0, -500);
|
||||||
|
this.idleSince = null;
|
||||||
|
} else if (somethingLeftBelow) {
|
||||||
|
// console.log("scrolling because everything on this screen has been clicked but there's more below document.body.clientHeight="
|
||||||
|
// + document.body.clientHeight);
|
||||||
|
window.scrollBy(0, 200);
|
||||||
|
this.idleSince = null;
|
||||||
|
} else if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) {
|
||||||
|
// console.log("scrolling because we're not to the bottom yet document.body.clientHeight="
|
||||||
|
// + document.body.clientHeight);
|
||||||
|
window.scrollBy(0, 200);
|
||||||
|
this.idleSince = null;
|
||||||
|
} else if (this.idleSince == null) {
|
||||||
|
this.idleSince = Date.now();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (!this.idleSince) {
|
||||||
|
this.idleSince = Date.now();
|
||||||
|
}
|
||||||
},
|
},
|
||||||
|
|
||||||
aboveBelowOrOnScreen : function(elem) {
|
aboveBelowOrOnScreen : function(elem) {
|
||||||
|
|
|
@ -105,7 +105,7 @@ class BrozzlerWorker:
|
||||||
self, frontier, service_registry=None, max_browsers=1,
|
self, frontier, service_registry=None, max_browsers=1,
|
||||||
chrome_exe="chromium-browser", warcprox_auto=False, proxy=None,
|
chrome_exe="chromium-browser", warcprox_auto=False, proxy=None,
|
||||||
skip_extract_outlinks=False, skip_visit_hashtags=False,
|
skip_extract_outlinks=False, skip_visit_hashtags=False,
|
||||||
page_timeout=300, behavior_timeout=900):
|
skip_youtube_dl=False, page_timeout=300, behavior_timeout=900):
|
||||||
self._frontier = frontier
|
self._frontier = frontier
|
||||||
self._service_registry = service_registry
|
self._service_registry = service_registry
|
||||||
self._max_browsers = max_browsers
|
self._max_browsers = max_browsers
|
||||||
|
@ -116,6 +116,7 @@ class BrozzlerWorker:
|
||||||
self._proxy_is_warcprox = None
|
self._proxy_is_warcprox = None
|
||||||
self._skip_extract_outlinks = skip_extract_outlinks
|
self._skip_extract_outlinks = skip_extract_outlinks
|
||||||
self._skip_visit_hashtags = skip_visit_hashtags
|
self._skip_visit_hashtags = skip_visit_hashtags
|
||||||
|
self._skip_youtube_dl = skip_youtube_dl
|
||||||
self._page_timeout = page_timeout
|
self._page_timeout = page_timeout
|
||||||
self._behavior_timeout = behavior_timeout
|
self._behavior_timeout = behavior_timeout
|
||||||
|
|
||||||
|
@ -420,6 +421,7 @@ class BrozzlerWorker:
|
||||||
on_request=on_request, hashtags=page.hashtags,
|
on_request=on_request, hashtags=page.hashtags,
|
||||||
skip_extract_outlinks=self._skip_extract_outlinks,
|
skip_extract_outlinks=self._skip_extract_outlinks,
|
||||||
skip_visit_hashtags=self._skip_visit_hashtags,
|
skip_visit_hashtags=self._skip_visit_hashtags,
|
||||||
|
skip_youtube_dl=self._skip_youtube_dl,
|
||||||
page_timeout=self._page_timeout,
|
page_timeout=self._page_timeout,
|
||||||
behavior_timeout=self._behavior_timeout)
|
behavior_timeout=self._behavior_timeout)
|
||||||
if final_page_url != page.url:
|
if final_page_url != page.url:
|
||||||
|
@ -485,7 +487,8 @@ class BrozzlerWorker:
|
||||||
page.blocked_by_robots = True
|
page.blocked_by_robots = True
|
||||||
self._frontier.completed_page(site, page)
|
self._frontier.completed_page(site, page)
|
||||||
else:
|
else:
|
||||||
outlinks = self.brozzle_page(browser, site, page)
|
outlinks = self.brozzle_page(browser, site, page,
|
||||||
|
enable_youtube_dl=not self._skip_youtube_dl)
|
||||||
self._frontier.completed_page(site, page)
|
self._frontier.completed_page(site, page)
|
||||||
self._frontier.scope_and_schedule_outlinks(
|
self._frontier.scope_and_schedule_outlinks(
|
||||||
site, page, outlinks)
|
site, page, outlinks)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue