mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-21 16:16:28 -04:00
Merge branch 'master' into behavior-refactor
This commit is contained in:
commit
5c6184201f
@ -5,6 +5,7 @@ python:
|
||||
- 3.6
|
||||
sudo: required
|
||||
dist: trusty
|
||||
group: deprecated-2017Q2 # https://blog.travis-ci.com/2017-06-21-trusty-updates-2017-Q2-launch
|
||||
before_install:
|
||||
- sudo pip install ansible==2.1.3.0
|
||||
install:
|
||||
|
@ -129,6 +129,13 @@
|
||||
click_css_selector: button#ird3-button-next
|
||||
click_until_hard_timeout: True
|
||||
request_idle_timeout_sec: 10
|
||||
- # https://webarchive.jira.com/browse/ARI-5389
|
||||
url_regex: '^https?://pitchfork\.com/.*$'
|
||||
behavior_js_template: pitchfork.js
|
||||
- # https://webarchive.jira.com/browse/ARI-5379
|
||||
url_regex: '^https?://pm\.gc\.ca/eng/news.*$'
|
||||
behavior_js_template: pm-ca.js
|
||||
request_idle_timeout_sec: 10
|
||||
- # https://webarchive.jira.com/browse/ARI-4960
|
||||
url_regex: '^https?://(?:www\.)?fortstjames.ca/community-events-calendar/$'
|
||||
behavior_js_template: simpleclicks.js.j2
|
||||
|
@ -379,7 +379,8 @@ class Browser:
|
||||
self, page_url, ignore_cert_errors=False, extra_headers=None,
|
||||
user_agent=None, behavior_parameters=None,
|
||||
on_request=None, on_response=None, on_screenshot=None,
|
||||
username=None, password=None, hashtags=None):
|
||||
username=None, password=None, hashtags=None,
|
||||
skip_extract_outlinks=False, skip_visit_hashtags=False):
|
||||
'''
|
||||
Browses page in browser.
|
||||
|
||||
@ -447,8 +448,12 @@ class Browser:
|
||||
behavior_script = brozzler.behavior_script(
|
||||
page_url, behavior_parameters)
|
||||
self.run_behavior(behavior_script, timeout=900)
|
||||
outlinks = self.extract_outlinks()
|
||||
self.visit_hashtags(page_url, hashtags, outlinks)
|
||||
if skip_extract_outlinks:
|
||||
outlinks = []
|
||||
else:
|
||||
outlinks = self.extract_outlinks()
|
||||
if not skip_visit_hashtags:
|
||||
self.visit_hashtags(page_url, hashtags, outlinks)
|
||||
final_page_url = self.url()
|
||||
return final_page_url, outlinks
|
||||
except brozzler.ReachedLimit:
|
||||
|
@ -154,6 +154,12 @@ def brozzle_page(argv=None):
|
||||
help='use this password to try to log in if a login form is found')
|
||||
arg_parser.add_argument(
|
||||
'--proxy', dest='proxy', default=None, help='http proxy')
|
||||
arg_parser.add_argument(
|
||||
'--skip-extract-outlinks', dest='skip_extract_outlinks',
|
||||
action='store_true', help=argparse.SUPPRESS)
|
||||
arg_parser.add_argument(
|
||||
'--skip-visit-hashtags', dest='skip_visit_hashtags',
|
||||
action='store_true', help=argparse.SUPPRESS)
|
||||
add_common_options(arg_parser, argv)
|
||||
|
||||
args = arg_parser.parse_args(args=argv[1:])
|
||||
@ -166,7 +172,9 @@ def brozzle_page(argv=None):
|
||||
'id': -1, 'seed': args.url, 'behavior_parameters': behavior_parameters,
|
||||
'username': args.username, 'password': args.password})
|
||||
page = brozzler.Page(None, {'url': args.url, 'site_id': site.id})
|
||||
worker = brozzler.BrozzlerWorker(frontier=None, proxy=args.proxy)
|
||||
worker = brozzler.BrozzlerWorker(frontier=None, proxy=args.proxy,
|
||||
skip_extract_outlinks=args.skip_extract_outlinks,
|
||||
skip_visit_hashtags=args.skip_visit_hashtags)
|
||||
|
||||
def on_screenshot(screenshot_png):
|
||||
OK_CHARS = (string.ascii_letters + string.digits)
|
||||
@ -299,6 +307,12 @@ def brozzler_worker(argv=None):
|
||||
help=(
|
||||
'when needed, choose an available instance of warcprox from '
|
||||
'the rethinkdb service registry'))
|
||||
arg_parser.add_argument(
|
||||
'--skip-extract-outlinks', dest='skip_extract_outlinks',
|
||||
action='store_true', help=argparse.SUPPRESS)
|
||||
arg_parser.add_argument(
|
||||
'--skip-visit-hashtags', dest='skip_visit_hashtags',
|
||||
action='store_true', help=argparse.SUPPRESS)
|
||||
add_common_options(arg_parser, argv)
|
||||
|
||||
args = arg_parser.parse_args(args=argv[1:])
|
||||
@ -331,7 +345,9 @@ def brozzler_worker(argv=None):
|
||||
worker = brozzler.worker.BrozzlerWorker(
|
||||
frontier, service_registry, max_browsers=int(args.max_browsers),
|
||||
chrome_exe=args.chrome_exe, proxy=args.proxy,
|
||||
warcprox_auto=args.warcprox_auto)
|
||||
warcprox_auto=args.warcprox_auto,
|
||||
skip_extract_outlinks=args.skip_extract_outlinks,
|
||||
skip_visit_hashtags=args.skip_visit_hashtags)
|
||||
|
||||
signal.signal(signal.SIGQUIT, dump_state)
|
||||
signal.signal(signal.SIGTERM, lambda s,f: worker.stop())
|
||||
@ -471,7 +487,9 @@ def brozzler_list_sites(argv=None):
|
||||
elif args.jobless:
|
||||
reql = reql.filter(~r.row.has_fields('job_id'))
|
||||
elif args.active:
|
||||
reql = reql.filter({'status': 'ACTIVE'})
|
||||
reql = reql.between(
|
||||
['ACTIVE', r.minval], ['ACTIVE', r.maxval],
|
||||
index='sites_last_disclaimed')
|
||||
logging.debug('querying rethinkdb: %s', reql)
|
||||
results = reql.run()
|
||||
if args.yaml:
|
||||
|
@ -135,10 +135,11 @@ class RethinkDbFrontier:
|
||||
|
||||
def _enforce_time_limit(self, site):
|
||||
if (site.time_limit and site.time_limit > 0
|
||||
and site.elapsed() > site.time_limit):
|
||||
and (site.active_brozzling_time or 0) > site.time_limit):
|
||||
self.logger.debug(
|
||||
"site FINISHED_TIME_LIMIT! time_limit=%s elapsed=%s %s",
|
||||
site.time_limit, site.elapsed(), site)
|
||||
"site FINISHED_TIME_LIMIT! time_limit=%s "
|
||||
"active_brozzling_time=%s %s", site.time_limit,
|
||||
site.active_brozzling_time, site)
|
||||
self.finished(site, "FINISHED_TIME_LIMIT")
|
||||
return True
|
||||
else:
|
||||
|
@ -35,7 +35,7 @@ var umbraAboveBelowOrOnScreen = function(e) {
|
||||
}
|
||||
|
||||
// comments - 'a.UFIPagerLink > span, a.UFIPagerLink, span.UFIReplySocialSentenceLinkText'
|
||||
var UMBRA_THINGS_TO_CLICK_SELECTOR = 'a[href^="/browse/likes"], *[rel="theater"]';
|
||||
var UMBRA_THINGS_TO_CLICK_SELECTOR = 'a.uiMorePagerPrimary, a[href^="/browse/likes"], *[rel="theater"]';
|
||||
//div[class="phm pluginLikeboxStream"] = facebook widget embedded in 3rd party pages
|
||||
var UMBRA_THINGS_TO_SCROLL_SELECTOR = 'div[class="phm pluginLikeboxStream"]';
|
||||
var NUMBER_FAILED_SCROLL_ATTEMPTS_ON_THING_TO_SCROLL_BEFORE_STOP_SCROLLING = 5;
|
||||
|
171
brozzler/js-templates/pitchfork.js
Normal file
171
brozzler/js-templates/pitchfork.js
Normal file
@ -0,0 +1,171 @@
|
||||
/*
|
||||
* brozzler/js-templates/pitchfork.js - behavior for http://pitchfork.com/festival/chicago/
|
||||
*
|
||||
* Copyright (C) 2014-2017 Internet Archive
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
var umbraBehavior = {
|
||||
IDLE_TIMEOUT_SEC : 10,
|
||||
idleSince : null,
|
||||
itemsText : "",
|
||||
|
||||
// https://github.com/jquery/jquery/blob/master/src/css/hiddenVisibleSelectors.js
|
||||
// n.b. returns true for elements with visibility:hidden, which occupy
|
||||
// screen real estate but are not visible, or clickable with the ui
|
||||
isVisible : function(elem) {
|
||||
return !!(elem.offsetWidth || elem.offsetHeight || elem.getClientRects().length);
|
||||
},
|
||||
|
||||
intervalFunc : function() {
|
||||
var clickedSomething = false;
|
||||
var somethingLeftBelow = false;
|
||||
var somethingLeftAbove = false;
|
||||
var cssSelector = "button.performer.full-lineup";
|
||||
var closeSelector = ".pmf-artist-modal__close-btn";
|
||||
var clickUntilTimeout = 10;
|
||||
|
||||
var iframes = document.querySelectorAll("iframe");
|
||||
var documents = Array(iframes.length + 1);
|
||||
documents[0] = document;
|
||||
|
||||
for (var i = 0; i < iframes.length; i++) {
|
||||
documents[i+1] = iframes[i].contentWindow.document;
|
||||
}
|
||||
|
||||
for (var j = 0; j < documents.length; j++) {
|
||||
var closeTargets = documents[j].querySelectorAll(closeSelector);
|
||||
for (var i = 0; i < closeTargets.length; i++) {
|
||||
if (!this.isVisible(closeTargets[i])) {
|
||||
continue;
|
||||
}
|
||||
|
||||
var where = this.aboveBelowOrOnScreen(closeTargets[i]);
|
||||
|
||||
if (where == 0) {
|
||||
// console.log("clicking on " + clickTargets[i].outerHTML);
|
||||
// do mouse over event on click target
|
||||
// since some urls are requested only on
|
||||
// this event - see
|
||||
// https://webarchive.jira.com/browse/AITFIVE-451
|
||||
var mouseOverEvent = document.createEvent('Events');
|
||||
mouseOverEvent.initEvent("mouseover",true, false);
|
||||
closeTargets[i].dispatchEvent(mouseOverEvent);
|
||||
closeTargets[i].click();
|
||||
clickedSomething = true;
|
||||
this.idleSince = null;
|
||||
|
||||
break; //break from closeTargets loop, but not from iframe loop
|
||||
} else if (where > 0) {
|
||||
somethingLeftBelow = true;
|
||||
} else if (where < 0) {
|
||||
somethingLeftAbove = true;
|
||||
}
|
||||
}
|
||||
|
||||
var clickTargets = documents[j].querySelectorAll(cssSelector);
|
||||
for (var i = 0; i < clickTargets.length; i++) {
|
||||
if (!this.isVisible(clickTargets[i])) {
|
||||
continue;
|
||||
}
|
||||
if (this.itemsText.indexOf(clickTargets[i].innerText) > -1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
var where = this.aboveBelowOrOnScreen(clickTargets[i]);
|
||||
|
||||
if (where == 0) {
|
||||
// console.log("clicking on " + clickTargets[i].outerHTML);
|
||||
// do mouse over event on click target
|
||||
// since some urls are requested only on
|
||||
// this event - see
|
||||
// https://webarchive.jira.com/browse/AITFIVE-451
|
||||
var mouseOverEvent = document.createEvent('Events');
|
||||
mouseOverEvent.initEvent("mouseover",true, false);
|
||||
clickTargets[i].dispatchEvent(mouseOverEvent);
|
||||
clickTargets[i].click();
|
||||
clickedSomething = true;
|
||||
this.idleSince = null;
|
||||
this.itemsText += clickTargets[i].innerText;
|
||||
|
||||
break; //break from clickTargets loop, but not from iframe loop
|
||||
} else if (where > 0) {
|
||||
somethingLeftBelow = true;
|
||||
} else if (where < 0) {
|
||||
somethingLeftAbove = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!clickedSomething) {
|
||||
if (somethingLeftAbove) {
|
||||
// console.log("scrolling UP because everything on this screen has been clicked but we missed something above");
|
||||
window.scrollBy(0, -500);
|
||||
this.idleSince = null;
|
||||
} else if (somethingLeftBelow) {
|
||||
// console.log("scrolling because everything on this screen has been clicked but there's more below document.body.clientHeight="
|
||||
// + document.body.clientHeight);
|
||||
window.scrollBy(0, 200);
|
||||
this.idleSince = null;
|
||||
} else if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) {
|
||||
// console.log("scrolling because we're not to the bottom yet document.body.clientHeight="
|
||||
// + document.body.clientHeight);
|
||||
window.scrollBy(0, 200);
|
||||
this.idleSince = null;
|
||||
} else if (this.idleSince == null) {
|
||||
this.idleSince = Date.now();
|
||||
}
|
||||
}
|
||||
|
||||
if (!this.idleSince) {
|
||||
this.idleSince = Date.now();
|
||||
}
|
||||
},
|
||||
|
||||
start : function() {
|
||||
var that = this;
|
||||
this.intervalId = setInterval(function() {
|
||||
that.intervalFunc()
|
||||
}, 500);
|
||||
},
|
||||
|
||||
isFinished : function() {
|
||||
if (this.idleSince != null) {
|
||||
var idleTimeMs = Date.now() - this.idleSince;
|
||||
if (idleTimeMs / 1000 > this.IDLE_TIMEOUT_SEC) {
|
||||
clearInterval(this.intervalId);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
},
|
||||
|
||||
aboveBelowOrOnScreen : function(e) {
|
||||
var eTop = e.getBoundingClientRect().top;
|
||||
if (eTop < window.scrollY) {
|
||||
return -1; // above
|
||||
} else if (eTop > window.scrollY + window.innerHeight) {
|
||||
return 1; // below
|
||||
} else {
|
||||
return 0; // on screen
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
// Called from outside of this script.
|
||||
var umbraBehaviorFinished = function() {
|
||||
return umbraBehavior.isFinished()
|
||||
};
|
||||
|
||||
umbraBehavior.start();
|
141
brozzler/js-templates/pm-ca.js
Normal file
141
brozzler/js-templates/pm-ca.js
Normal file
@ -0,0 +1,141 @@
|
||||
/*
|
||||
* brozzler/js-templates/pm-ca.js - behavior for http://pm.gc.ca/eng/news
|
||||
*
|
||||
* Copyright (C) 2014-2017 Internet Archive
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
var umbraBehavior = {
|
||||
IDLE_TIMEOUT_SEC : 10,
|
||||
idleSince : null,
|
||||
itemsText : "",
|
||||
|
||||
// https://github.com/jquery/jquery/blob/master/src/css/hiddenVisibleSelectors.js
|
||||
// n.b. returns true for elements with visibility:hidden, which occupy
|
||||
// screen real estate but are not visible, or clickable with the ui
|
||||
isVisible : function(elem) {
|
||||
return !!(elem.offsetWidth || elem.offsetHeight || elem.getClientRects().length);
|
||||
},
|
||||
|
||||
intervalFunc : function() {
|
||||
var clickedSomething = false;
|
||||
var somethingLeftBelow = false;
|
||||
var somethingLeftAbove = false;
|
||||
var cssSelector = "div.teaser";
|
||||
var clickUntilTimeout = 10;
|
||||
|
||||
var iframes = document.querySelectorAll("iframe");
|
||||
var documents = Array(iframes.length + 1);
|
||||
documents[0] = document;
|
||||
|
||||
for (var i = 0; i < iframes.length; i++) {
|
||||
documents[i+1] = iframes[i].contentWindow.document;
|
||||
}
|
||||
|
||||
for (var j = 0; j < documents.length; j++) {
|
||||
var clickTargets = documents[j].querySelectorAll(cssSelector);
|
||||
for (var i = 0; i < clickTargets.length; i++) {
|
||||
if (!this.isVisible(clickTargets[i])) {
|
||||
continue;
|
||||
}
|
||||
if (this.itemsText.indexOf(clickTargets[i].innerText) > -1) {
|
||||
continue;
|
||||
}
|
||||
|
||||
var where = this.aboveBelowOrOnScreen(clickTargets[i]);
|
||||
|
||||
if (where == 0) {
|
||||
// console.log("clicking on " + clickTargets[i].outerHTML);
|
||||
// do mouse over event on click target
|
||||
// since some urls are requested only on
|
||||
// this event - see
|
||||
// https://webarchive.jira.com/browse/AITFIVE-451
|
||||
var mouseOverEvent = document.createEvent('Events');
|
||||
mouseOverEvent.initEvent("mouseover",true, false);
|
||||
clickTargets[i].dispatchEvent(mouseOverEvent);
|
||||
clickTargets[i].click();
|
||||
clickedSomething = true;
|
||||
this.idleSince = null;
|
||||
this.itemsText += clickTargets[i].innerText;
|
||||
|
||||
break; //break from clickTargets loop, but not from iframe loop
|
||||
} else if (where > 0) {
|
||||
somethingLeftBelow = true;
|
||||
} else if (where < 0) {
|
||||
somethingLeftAbove = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!clickedSomething) {
|
||||
if (somethingLeftAbove) {
|
||||
// console.log("scrolling UP because everything on this screen has been clicked but we missed something above");
|
||||
window.scrollBy(0, -500);
|
||||
this.idleSince = null;
|
||||
} else if (somethingLeftBelow) {
|
||||
// console.log("scrolling because everything on this screen has been clicked but there's more below document.body.clientHeight="
|
||||
// + document.body.clientHeight);
|
||||
window.scrollBy(0, 200);
|
||||
this.idleSince = null;
|
||||
} else if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) {
|
||||
// console.log("scrolling because we're not to the bottom yet document.body.clientHeight="
|
||||
// + document.body.clientHeight);
|
||||
window.scrollBy(0, 200);
|
||||
this.idleSince = null;
|
||||
} else if (this.idleSince == null) {
|
||||
this.idleSince = Date.now();
|
||||
}
|
||||
}
|
||||
|
||||
if (!this.idleSince) {
|
||||
this.idleSince = Date.now();
|
||||
}
|
||||
},
|
||||
|
||||
start : function() {
|
||||
var that = this;
|
||||
this.intervalId = setInterval(function() {
|
||||
that.intervalFunc()
|
||||
}, 500);
|
||||
},
|
||||
|
||||
isFinished : function() {
|
||||
if (this.idleSince != null) {
|
||||
var idleTimeMs = Date.now() - this.idleSince;
|
||||
if (idleTimeMs / 1000 > this.IDLE_TIMEOUT_SEC) {
|
||||
clearInterval(this.intervalId);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
},
|
||||
|
||||
aboveBelowOrOnScreen : function(e) {
|
||||
var eTop = e.getBoundingClientRect().top;
|
||||
if (eTop < window.scrollY) {
|
||||
return -1; // above
|
||||
} else if (eTop > window.scrollY + window.innerHeight) {
|
||||
return 1; // below
|
||||
} else {
|
||||
return 0; // on screen
|
||||
}
|
||||
},
|
||||
};
|
||||
|
||||
// Called from outside of this script.
|
||||
var umbraBehaviorFinished = function() {
|
||||
return umbraBehavior.isFinished()
|
||||
};
|
||||
|
||||
umbraBehavior.start();
|
@ -119,7 +119,15 @@ def new_site(frontier, site):
|
||||
|
||||
class ElapsedMixIn(object):
|
||||
def elapsed(self):
|
||||
'''Returns elapsed crawl time as a float in seconds.'''
|
||||
'''
|
||||
Returns elapsed crawl time as a float in seconds.
|
||||
|
||||
This metric includes all the time that a site was in active rotation,
|
||||
including any time it spent waiting for its turn to be brozzled.
|
||||
|
||||
In contrast `Site.active_brozzling_time` only counts time when a
|
||||
brozzler worker claimed the site and was actively brozzling it.
|
||||
'''
|
||||
dt = 0
|
||||
for ss in self.starts_and_stops[:-1]:
|
||||
dt += (ss['stop'] - ss['start']).total_seconds()
|
||||
|
@ -36,6 +36,7 @@ import tempfile
|
||||
import urlcanon
|
||||
from requests.structures import CaseInsensitiveDict
|
||||
import rethinkdb as r
|
||||
import datetime
|
||||
|
||||
class ExtraHeaderAdder(urllib.request.BaseHandler):
|
||||
def __init__(self, extra_headers):
|
||||
@ -102,7 +103,8 @@ class BrozzlerWorker:
|
||||
|
||||
def __init__(
|
||||
self, frontier, service_registry=None, max_browsers=1,
|
||||
chrome_exe="chromium-browser", warcprox_auto=False, proxy=None):
|
||||
chrome_exe="chromium-browser", warcprox_auto=False, proxy=None,
|
||||
skip_extract_outlinks=False, skip_visit_hashtags=False):
|
||||
self._frontier = frontier
|
||||
self._service_registry = service_registry
|
||||
self._max_browsers = max_browsers
|
||||
@ -111,6 +113,8 @@ class BrozzlerWorker:
|
||||
self._proxy = proxy
|
||||
assert not (warcprox_auto and proxy)
|
||||
self._proxy_is_warcprox = None
|
||||
self._skip_extract_outlinks = skip_extract_outlinks
|
||||
self._skip_visit_hashtags = skip_visit_hashtags
|
||||
|
||||
self._browser_pool = brozzler.browser.BrowserPool(
|
||||
max_browsers, chrome_exe=chrome_exe, ignore_cert_errors=True)
|
||||
@ -156,7 +160,23 @@ class BrozzlerWorker:
|
||||
else:
|
||||
return bool(site.proxy or self._warcprox_auto)
|
||||
|
||||
|
||||
def _youtube_dl(self, destdir, site):
|
||||
def ydl_progress(*args, **kwargs):
|
||||
# in case youtube-dl takes a long time, heartbeat site.last_claimed
|
||||
# to prevent another brozzler-worker from claiming the site
|
||||
try:
|
||||
if site.rr and doublethink.utcnow() - site.last_claimed > datetime.timedelta(minutes=7):
|
||||
self.logger.debug(
|
||||
'heartbeating site.last_claimed to prevent another '
|
||||
'brozzler-worker claiming this site id=%r', site.id)
|
||||
site.last_claimed = doublethink.utcnow()
|
||||
site.save()
|
||||
except:
|
||||
self.logger.debug(
|
||||
'problem heartbeating site.last_claimed site id=%r',
|
||||
site.id, exc_info=True)
|
||||
|
||||
ydl_opts = {
|
||||
"outtmpl": "{}/ydl%(autonumber)s.out".format(destdir),
|
||||
"verbose": False,
|
||||
@ -167,6 +187,11 @@ class BrozzlerWorker:
|
||||
"noprogress": True,
|
||||
"nopart": True,
|
||||
"no_color": True,
|
||||
"progress_hooks": [ydl_progress],
|
||||
# https://github.com/rg3/youtube-dl/blob/master/README.md#format-selection
|
||||
# "best: Select the best quality format represented by a single
|
||||
# file with video and audio."
|
||||
"format": "best/bestvideo+bestaudio",
|
||||
}
|
||||
if self._proxy_for(site):
|
||||
ydl_opts["proxy"] = "http://{}".format(self._proxy_for(site))
|
||||
@ -384,7 +409,9 @@ class BrozzlerWorker:
|
||||
username=site.get('username'), password=site.get('password'),
|
||||
user_agent=site.get('user_agent'),
|
||||
on_screenshot=_on_screenshot, on_response=_on_response,
|
||||
hashtags=page.hashtags)
|
||||
hashtags=page.hashtags,
|
||||
skip_extract_outlinks=self._skip_extract_outlinks,
|
||||
skip_visit_hashtags=self._skip_visit_hashtags)
|
||||
if final_page_url != page.url:
|
||||
page.note_redirect(final_page_url)
|
||||
return outlinks
|
||||
@ -425,12 +452,12 @@ class BrozzlerWorker:
|
||||
|
||||
def brozzle_site(self, browser, site):
|
||||
try:
|
||||
start = time.time()
|
||||
page = None
|
||||
self._frontier.honor_stop_request(site)
|
||||
self.logger.info(
|
||||
"brozzling site (proxy=%r) %r",
|
||||
self._proxy_for(site), site)
|
||||
start = time.time()
|
||||
while time.time() - start < 7 * 60:
|
||||
site.refresh()
|
||||
self._frontier.honor_stop_request(site)
|
||||
@ -477,6 +504,8 @@ class BrozzlerWorker:
|
||||
except:
|
||||
self.logger.critical("unexpected exception", exc_info=True)
|
||||
finally:
|
||||
if start:
|
||||
site.active_brozzling_time = (site.active_brozzling_time or 0) + time.time() - start
|
||||
self._frontier.disclaim_site(site, page)
|
||||
|
||||
def _brozzle_site_thread_target(self, browser, site):
|
||||
|
102
job-conf.rst
102
job-conf.rst
@ -12,7 +12,6 @@ an example
|
||||
|
||||
id: myjob
|
||||
time_limit: 60 # seconds
|
||||
proxy: 127.0.0.1:8000 # point at warcprox for archiving
|
||||
ignore_robots: false
|
||||
warcprox_meta:
|
||||
warc-prefix: job1
|
||||
@ -82,8 +81,8 @@ Notice that:
|
||||
settings reference
|
||||
==================
|
||||
|
||||
id
|
||||
--
|
||||
``id``
|
||||
------
|
||||
+-----------+--------+----------+--------------------------+
|
||||
| scope | type | required | default |
|
||||
+===========+========+==========+==========================+
|
||||
@ -92,8 +91,8 @@ id
|
||||
An arbitrary identifier for this job. Must be unique across this deployment of
|
||||
brozzler.
|
||||
|
||||
seeds
|
||||
-----
|
||||
``seeds``
|
||||
---------
|
||||
+-----------+------------------------+----------+---------+
|
||||
| scope | type | required | default |
|
||||
+===========+========================+==========+=========+
|
||||
@ -103,8 +102,8 @@ List of seeds. Each item in the list is a dictionary (associative array) which
|
||||
defines the seed. It must specify ``url`` (see below) and can additionally
|
||||
specify any of the settings of scope *seed-level*.
|
||||
|
||||
url
|
||||
---
|
||||
``url``
|
||||
-------
|
||||
+------------+--------+----------+---------+
|
||||
| scope | type | required | default |
|
||||
+============+========+==========+=========+
|
||||
@ -112,8 +111,11 @@ url
|
||||
+------------+--------+----------+---------+
|
||||
The seed url.
|
||||
|
||||
time_limit
|
||||
----------
|
||||
``metadata``
|
||||
------------
|
||||
|
||||
``time_limit``
|
||||
--------------
|
||||
+-----------------------+--------+----------+---------+
|
||||
| scope | type | required | default |
|
||||
+=======================+========+==========+=========+
|
||||
@ -124,28 +126,18 @@ enforced at the seed level. If a time limit is specified at the top level, it
|
||||
is inherited by each seed as described above, and enforced individually on each
|
||||
seed.
|
||||
|
||||
proxy
|
||||
-----
|
||||
+-----------------------+--------+----------+---------+
|
||||
| scope | type | required | default |
|
||||
+=======================+========+==========+=========+
|
||||
| seed-level, top-level | string | no | *none* |
|
||||
+-----------------------+--------+----------+---------+
|
||||
HTTP proxy, with the format ``host:port``. Typically configured to point to
|
||||
warcprox for archival crawling.
|
||||
|
||||
ignore_robots
|
||||
-------------
|
||||
+-----------------------+---------+----------+---------+
|
||||
| scope | type | required | default |
|
||||
+=======================+=========+==========+=========+
|
||||
| seed-level, top-level | boolean | no | false |
|
||||
+-----------------------+---------+----------+---------+
|
||||
``ignore_robots``
|
||||
-----------------
|
||||
+-----------------------+---------+----------+-----------+
|
||||
| scope | type | required | default |
|
||||
+=======================+=========+==========+===========+
|
||||
| seed-level, top-level | boolean | no | ``false`` |
|
||||
+-----------------------+---------+----------+-----------+
|
||||
If set to ``true``, brozzler will happily crawl pages that would otherwise be
|
||||
blocked by robots.txt rules.
|
||||
|
||||
user_agent
|
||||
----------
|
||||
``user_agent``
|
||||
--------------
|
||||
+-----------------------+---------+----------+---------+
|
||||
| scope | type | required | default |
|
||||
+=======================+=========+==========+=========+
|
||||
@ -156,13 +148,13 @@ It's good etiquette to include a project URL with a notice to webmasters that
|
||||
explains why you're crawling, how to block the crawler via robots.txt and how to
|
||||
contact the operator if the crawl is causing problems.
|
||||
|
||||
warcprox_meta
|
||||
-------------
|
||||
+-----------------------+------------+----------+---------+
|
||||
| scope | type | required | default |
|
||||
+=======================+============+==========+=========+
|
||||
| seed-level, top-level | dictionary | no | false |
|
||||
+-----------------------+------------+----------+---------+
|
||||
``warcprox_meta``
|
||||
-----------------
|
||||
+-----------------------+------------+----------+-----------+
|
||||
| scope | type | required | default |
|
||||
+=======================+============+==========+===========+
|
||||
| seed-level, top-level | dictionary | no | ``false`` |
|
||||
+-----------------------+------------+----------+-----------+
|
||||
Specifies the Warcprox-Meta header to send with every request, if ``proxy`` is
|
||||
configured. The value of the Warcprox-Meta header is a json blob. It is used to
|
||||
pass settings and information to warcprox. Warcprox does not forward the header
|
||||
@ -183,11 +175,37 @@ becomes::
|
||||
|
||||
Warcprox-Meta: {"warc-prefix":"job1-seed1","stats":{"buckets":["job1-stats","job1-seed1-stats"]}}
|
||||
|
||||
scope
|
||||
-----
|
||||
+-----------------------+------------+----------+---------+
|
||||
| scope | type | required | default |
|
||||
+=======================+============+==========+=========+
|
||||
| seed-level, top-level | dictionary | no | false |
|
||||
+-----------------------+------------+----------+---------+
|
||||
``scope``
|
||||
---------
|
||||
+-----------------------+------------+----------+-----------+
|
||||
| scope | type | required | default |
|
||||
+=======================+============+==========+===========+
|
||||
| seed-level, top-level | dictionary | no | ``false`` |
|
||||
+-----------------------+------------+----------+-----------+
|
||||
Scope rules. *TODO*
|
||||
|
||||
``surt``
|
||||
--------
|
||||
+-------------+--------+----------+---------------------------+
|
||||
| scope | type | required | default |
|
||||
+=============+========+==========+===========================+
|
||||
| scope-level | string | no | *generated from seed url* |
|
||||
+-------------+--------+----------+---------------------------+
|
||||
|
||||
``accepts``
|
||||
-----------
|
||||
+-------------+------+----------+---------+
|
||||
| scope | type | required | default |
|
||||
+=============+======+==========+=========+
|
||||
| scope-level | list | no | *none* |
|
||||
+-------------+------+----------+---------+
|
||||
|
||||
``blocks``
|
||||
-----------
|
||||
+-------------+------+----------+---------+
|
||||
| scope | type | required | default |
|
||||
+=============+======+==========+=========+
|
||||
| scope-level | list | no | *none* |
|
||||
+-------------+------+----------+---------+
|
||||
|
||||
|
||||
|
2
setup.py
2
setup.py
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||
|
||||
setuptools.setup(
|
||||
name='brozzler',
|
||||
version='1.1b12.dev257',
|
||||
version='1.1b12.dev265',
|
||||
description='Distributed web crawling with browsers',
|
||||
url='https://github.com/internetarchive/brozzler',
|
||||
author='Noah Levitt',
|
||||
|
@ -238,6 +238,9 @@ def test_resume_job():
|
||||
assert site.starts_and_stops[2]['stop'] > site.starts_and_stops[0]['start']
|
||||
|
||||
def test_time_limit():
|
||||
# XXX test not thoroughly adapted to change in time accounting, since
|
||||
# starts_and_stops is no longer used to enforce time limits
|
||||
|
||||
# vagrant brozzler-worker isn't configured to look at the "ignoreme" db
|
||||
rr = doublethink.Rethinker('localhost', db='ignoreme')
|
||||
frontier = brozzler.RethinkDbFrontier(rr)
|
||||
@ -277,9 +280,16 @@ def test_time_limit():
|
||||
site.claimed = True
|
||||
site.save()
|
||||
|
||||
time.sleep(0.1)
|
||||
# time limit not reached yet
|
||||
frontier._enforce_time_limit(site)
|
||||
assert site.status == 'ACTIVE'
|
||||
assert len(site.starts_and_stops) == 2
|
||||
assert site.starts_and_stops[1]['start']
|
||||
assert site.starts_and_stops[1]['stop'] is None
|
||||
|
||||
site.active_brozzling_time = 0.2 # this is why the time limit will be hit
|
||||
|
||||
frontier._enforce_time_limit(site)
|
||||
assert site.status == 'FINISHED_TIME_LIMIT'
|
||||
assert not site.claimed
|
||||
assert len(site.starts_and_stops) == 2
|
||||
|
Loading…
x
Reference in New Issue
Block a user