Merge branch 'master' into behavior-refactor

This commit is contained in:
Barbara Miller 2017-07-21 16:24:36 -07:00
commit 5c6184201f
13 changed files with 467 additions and 58 deletions

View File

@ -5,6 +5,7 @@ python:
- 3.6
sudo: required
dist: trusty
group: deprecated-2017Q2 # https://blog.travis-ci.com/2017-06-21-trusty-updates-2017-Q2-launch
before_install:
- sudo pip install ansible==2.1.3.0
install:

View File

@ -129,6 +129,13 @@
click_css_selector: button#ird3-button-next
click_until_hard_timeout: True
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/ARI-5389
url_regex: '^https?://pitchfork\.com/.*$'
behavior_js_template: pitchfork.js
- # https://webarchive.jira.com/browse/ARI-5379
url_regex: '^https?://pm\.gc\.ca/eng/news.*$'
behavior_js_template: pm-ca.js
request_idle_timeout_sec: 10
- # https://webarchive.jira.com/browse/ARI-4960
url_regex: '^https?://(?:www\.)?fortstjames.ca/community-events-calendar/$'
behavior_js_template: simpleclicks.js.j2

View File

@ -379,7 +379,8 @@ class Browser:
self, page_url, ignore_cert_errors=False, extra_headers=None,
user_agent=None, behavior_parameters=None,
on_request=None, on_response=None, on_screenshot=None,
username=None, password=None, hashtags=None):
username=None, password=None, hashtags=None,
skip_extract_outlinks=False, skip_visit_hashtags=False):
'''
Browses page in browser.
@ -447,8 +448,12 @@ class Browser:
behavior_script = brozzler.behavior_script(
page_url, behavior_parameters)
self.run_behavior(behavior_script, timeout=900)
outlinks = self.extract_outlinks()
self.visit_hashtags(page_url, hashtags, outlinks)
if skip_extract_outlinks:
outlinks = []
else:
outlinks = self.extract_outlinks()
if not skip_visit_hashtags:
self.visit_hashtags(page_url, hashtags, outlinks)
final_page_url = self.url()
return final_page_url, outlinks
except brozzler.ReachedLimit:

View File

@ -154,6 +154,12 @@ def brozzle_page(argv=None):
help='use this password to try to log in if a login form is found')
arg_parser.add_argument(
'--proxy', dest='proxy', default=None, help='http proxy')
arg_parser.add_argument(
'--skip-extract-outlinks', dest='skip_extract_outlinks',
action='store_true', help=argparse.SUPPRESS)
arg_parser.add_argument(
'--skip-visit-hashtags', dest='skip_visit_hashtags',
action='store_true', help=argparse.SUPPRESS)
add_common_options(arg_parser, argv)
args = arg_parser.parse_args(args=argv[1:])
@ -166,7 +172,9 @@ def brozzle_page(argv=None):
'id': -1, 'seed': args.url, 'behavior_parameters': behavior_parameters,
'username': args.username, 'password': args.password})
page = brozzler.Page(None, {'url': args.url, 'site_id': site.id})
worker = brozzler.BrozzlerWorker(frontier=None, proxy=args.proxy)
worker = brozzler.BrozzlerWorker(frontier=None, proxy=args.proxy,
skip_extract_outlinks=args.skip_extract_outlinks,
skip_visit_hashtags=args.skip_visit_hashtags)
def on_screenshot(screenshot_png):
OK_CHARS = (string.ascii_letters + string.digits)
@ -299,6 +307,12 @@ def brozzler_worker(argv=None):
help=(
'when needed, choose an available instance of warcprox from '
'the rethinkdb service registry'))
arg_parser.add_argument(
'--skip-extract-outlinks', dest='skip_extract_outlinks',
action='store_true', help=argparse.SUPPRESS)
arg_parser.add_argument(
'--skip-visit-hashtags', dest='skip_visit_hashtags',
action='store_true', help=argparse.SUPPRESS)
add_common_options(arg_parser, argv)
args = arg_parser.parse_args(args=argv[1:])
@ -331,7 +345,9 @@ def brozzler_worker(argv=None):
worker = brozzler.worker.BrozzlerWorker(
frontier, service_registry, max_browsers=int(args.max_browsers),
chrome_exe=args.chrome_exe, proxy=args.proxy,
warcprox_auto=args.warcprox_auto)
warcprox_auto=args.warcprox_auto,
skip_extract_outlinks=args.skip_extract_outlinks,
skip_visit_hashtags=args.skip_visit_hashtags)
signal.signal(signal.SIGQUIT, dump_state)
signal.signal(signal.SIGTERM, lambda s,f: worker.stop())
@ -471,7 +487,9 @@ def brozzler_list_sites(argv=None):
elif args.jobless:
reql = reql.filter(~r.row.has_fields('job_id'))
elif args.active:
reql = reql.filter({'status': 'ACTIVE'})
reql = reql.between(
['ACTIVE', r.minval], ['ACTIVE', r.maxval],
index='sites_last_disclaimed')
logging.debug('querying rethinkdb: %s', reql)
results = reql.run()
if args.yaml:

View File

@ -135,10 +135,11 @@ class RethinkDbFrontier:
def _enforce_time_limit(self, site):
if (site.time_limit and site.time_limit > 0
and site.elapsed() > site.time_limit):
and (site.active_brozzling_time or 0) > site.time_limit):
self.logger.debug(
"site FINISHED_TIME_LIMIT! time_limit=%s elapsed=%s %s",
site.time_limit, site.elapsed(), site)
"site FINISHED_TIME_LIMIT! time_limit=%s "
"active_brozzling_time=%s %s", site.time_limit,
site.active_brozzling_time, site)
self.finished(site, "FINISHED_TIME_LIMIT")
return True
else:

View File

@ -35,7 +35,7 @@ var umbraAboveBelowOrOnScreen = function(e) {
}
// comments - 'a.UFIPagerLink > span, a.UFIPagerLink, span.UFIReplySocialSentenceLinkText'
var UMBRA_THINGS_TO_CLICK_SELECTOR = 'a[href^="/browse/likes"], *[rel="theater"]';
var UMBRA_THINGS_TO_CLICK_SELECTOR = 'a.uiMorePagerPrimary, a[href^="/browse/likes"], *[rel="theater"]';
//div[class="phm pluginLikeboxStream"] = facebook widget embedded in 3rd party pages
var UMBRA_THINGS_TO_SCROLL_SELECTOR = 'div[class="phm pluginLikeboxStream"]';
var NUMBER_FAILED_SCROLL_ATTEMPTS_ON_THING_TO_SCROLL_BEFORE_STOP_SCROLLING = 5;

View File

@ -0,0 +1,171 @@
/*
* brozzler/behaviors.d/pitchfork.js - behavior for http://pitchfork.com/festival/chicago/
*
* Copyright (C) 2014-2017 Internet Archive
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
var umbraBehavior = {
    // seconds without a click or scroll before isFinished() returns true
    IDLE_TIMEOUT_SEC : 10,
    // Date.now() value from which idle time is measured; null until
    // intervalFunc() has run once
    idleSince : null,
    // concatenated innerText of every cssSelector target clicked so far,
    // consulted to avoid clicking the same target twice
    itemsText : "",

    // https://github.com/jquery/jquery/blob/master/src/css/hiddenVisibleSelectors.js
    // n.b. returns true for elements with visibility:hidden, which occupy
    // screen real estate but are not visible, or clickable with the ui
    isVisible : function(elem) {
        return !!(elem.offsetWidth || elem.offsetHeight || elem.getClientRects().length);
    },

    // Dispatches a "mouseover" event on elem and then clicks it. The
    // mouseover comes first because some urls are requested only on that
    // event - see https://webarchive.jira.com/browse/AITFIVE-451
    mouseOverAndClick : function(elem) {
        var mouseOverEvent = document.createEvent('Events');
        mouseOverEvent.initEvent("mouseover", true, false);
        elem.dispatchEvent(mouseOverEvent);
        elem.click();
    },

    // One polling step, run every 500ms by start(). For the top document
    // and each iframe document it clicks at most one visible on-screen
    // closeSelector element and at most one visible, on-screen, not yet
    // clicked cssSelector element. If nothing was clicked it scrolls toward
    // whatever targets remain, or toward the bottom of the page.
    intervalFunc : function() {
        var clickedSomething = false;
        var somethingLeftBelow = false;
        var somethingLeftAbove = false;

        var cssSelector = "button.performer.full-lineup";
        // presumably the close button of the modal opened by a cssSelector
        // click (going by the class name) - confirm against the live page
        var closeSelector = ".pmf-artist-modal__close-btn";

        var iframes = document.querySelectorAll("iframe");
        var documents = Array(iframes.length + 1);
        documents[0] = document;
        for (var f = 0; f < iframes.length; f++) {
            documents[f+1] = iframes[f].contentWindow.document;
        }

        for (var d = 0; d < documents.length; d++) {
            var closeTargets = documents[d].querySelectorAll(closeSelector);
            for (var c = 0; c < closeTargets.length; c++) {
                if (!this.isVisible(closeTargets[c])) {
                    continue;
                }

                var closeWhere = this.aboveBelowOrOnScreen(closeTargets[c]);
                if (closeWhere == 0) {
                    // console.log("clicking on " + closeTargets[c].outerHTML);
                    this.mouseOverAndClick(closeTargets[c]);
                    clickedSomething = true;
                    this.idleSince = null;
                    break; //break from closeTargets loop, but not from iframe loop
                } else if (closeWhere > 0) {
                    somethingLeftBelow = true;
                } else if (closeWhere < 0) {
                    somethingLeftAbove = true;
                }
            }

            var clickTargets = documents[d].querySelectorAll(cssSelector);
            for (var t = 0; t < clickTargets.length; t++) {
                if (!this.isVisible(clickTargets[t])) {
                    continue;
                }

                // NOTE(review): this is a substring test, so a target whose
                // text is empty or is contained in the text accumulated so
                // far is skipped as well
                if (this.itemsText.indexOf(clickTargets[t].innerText) > -1) {
                    continue;
                }

                var where = this.aboveBelowOrOnScreen(clickTargets[t]);
                if (where == 0) {
                    // console.log("clicking on " + clickTargets[t].outerHTML);
                    this.mouseOverAndClick(clickTargets[t]);
                    clickedSomething = true;
                    this.idleSince = null;
                    this.itemsText += clickTargets[t].innerText;
                    break; //break from clickTargets loop, but not from iframe loop
                } else if (where > 0) {
                    somethingLeftBelow = true;
                } else if (where < 0) {
                    somethingLeftAbove = true;
                }
            }
        }

        if (!clickedSomething) {
            if (somethingLeftAbove) {
                // console.log("scrolling UP because everything on this screen has been clicked but we missed something above");
                window.scrollBy(0, -500);
                this.idleSince = null;
            } else if (somethingLeftBelow) {
                // console.log("scrolling because everything on this screen has been clicked but there's more below document.body.clientHeight="
                //              + document.body.clientHeight);
                window.scrollBy(0, 200);
                this.idleSince = null;
            } else if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) {
                // console.log("scrolling because we're not to the bottom yet document.body.clientHeight="
                //              + document.body.clientHeight);
                window.scrollBy(0, 200);
                this.idleSince = null;
            } else if (this.idleSince == null) {
                this.idleSince = Date.now();
            }
        }

        // every branch above that clicked or scrolled reset idleSince to
        // null, so this restarts the idle clock at "now"; when nothing
        // happened the earlier timestamp is kept
        if (!this.idleSince) {
            this.idleSince = Date.now();
        }
    },

    // Schedules intervalFunc() to run every 500ms and remembers the timer
    // id in this.intervalId so isFinished() can cancel it.
    start : function() {
        var that = this;
        this.intervalId = setInterval(function() {
            that.intervalFunc()
        }, 500);
    },

    // Returns true, and cancels the polling timer, once more than
    // IDLE_TIMEOUT_SEC seconds have passed since idleSince; false otherwise.
    isFinished : function() {
        if (this.idleSince != null) {
            var idleTimeMs = Date.now() - this.idleSince;
            if (idleTimeMs / 1000 > this.IDLE_TIMEOUT_SEC) {
                clearInterval(this.intervalId);
                return true;
            }
        }
        return false;
    },

    // Returns -1 if the top edge of e is above the viewport, 1 if it is
    // below the viewport, 0 if it is within it. getBoundingClientRect() is
    // already relative to the viewport, so the top edge is compared with 0
    // and window.innerHeight and window.scrollY must not be added in.
    aboveBelowOrOnScreen : function(e) {
        var eTop = e.getBoundingClientRect().top;
        if (eTop < 0) {
            return -1; // above
        } else if (eTop > window.innerHeight) {
            return 1;  // below
        } else {
            return 0;  // on screen
        }
    },
};
// Called from outside of this script; reports whatever
// umbraBehavior.isFinished() reports.
var umbraBehaviorFinished = function() {
    var finished = umbraBehavior.isFinished();
    return finished;
};

// kick off the polling loop as soon as the script is evaluated
umbraBehavior.start();

View File

@ -0,0 +1,141 @@
/*
* brozzler/behaviors.d/pm-ca.js - behavior for http://pm.gc.ca/eng/news
*
* Copyright (C) 2014-2017 Internet Archive
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
var umbraBehavior = {
    // seconds without a click or scroll before isFinished() returns true
    IDLE_TIMEOUT_SEC : 10,
    // Date.now() value from which idle time is measured; null until
    // intervalFunc() has run once
    idleSince : null,
    // concatenated innerText of every cssSelector target clicked so far,
    // consulted to avoid clicking the same target twice
    itemsText : "",

    // https://github.com/jquery/jquery/blob/master/src/css/hiddenVisibleSelectors.js
    // n.b. returns true for elements with visibility:hidden, which occupy
    // screen real estate but are not visible, or clickable with the ui
    isVisible : function(elem) {
        return !!(elem.offsetWidth || elem.offsetHeight || elem.getClientRects().length);
    },

    // Dispatches a "mouseover" event on elem and then clicks it. The
    // mouseover comes first because some urls are requested only on that
    // event - see https://webarchive.jira.com/browse/AITFIVE-451
    mouseOverAndClick : function(elem) {
        var mouseOverEvent = document.createEvent('Events');
        mouseOverEvent.initEvent("mouseover", true, false);
        elem.dispatchEvent(mouseOverEvent);
        elem.click();
    },

    // One polling step, run every 500ms by start(). For the top document
    // and each iframe document it clicks at most one visible, on-screen,
    // not yet clicked cssSelector element. If nothing was clicked it
    // scrolls toward whatever targets remain, or toward the bottom of the
    // page.
    intervalFunc : function() {
        var clickedSomething = false;
        var somethingLeftBelow = false;
        var somethingLeftAbove = false;

        var cssSelector = "div.teaser";

        var iframes = document.querySelectorAll("iframe");
        var documents = Array(iframes.length + 1);
        documents[0] = document;
        for (var f = 0; f < iframes.length; f++) {
            documents[f+1] = iframes[f].contentWindow.document;
        }

        for (var d = 0; d < documents.length; d++) {
            var clickTargets = documents[d].querySelectorAll(cssSelector);
            for (var t = 0; t < clickTargets.length; t++) {
                if (!this.isVisible(clickTargets[t])) {
                    continue;
                }

                // NOTE(review): this is a substring test, so a target whose
                // text is empty or is contained in the text accumulated so
                // far is skipped as well
                if (this.itemsText.indexOf(clickTargets[t].innerText) > -1) {
                    continue;
                }

                var where = this.aboveBelowOrOnScreen(clickTargets[t]);
                if (where == 0) {
                    // console.log("clicking on " + clickTargets[t].outerHTML);
                    this.mouseOverAndClick(clickTargets[t]);
                    clickedSomething = true;
                    this.idleSince = null;
                    this.itemsText += clickTargets[t].innerText;
                    break; //break from clickTargets loop, but not from iframe loop
                } else if (where > 0) {
                    somethingLeftBelow = true;
                } else if (where < 0) {
                    somethingLeftAbove = true;
                }
            }
        }

        if (!clickedSomething) {
            if (somethingLeftAbove) {
                // console.log("scrolling UP because everything on this screen has been clicked but we missed something above");
                window.scrollBy(0, -500);
                this.idleSince = null;
            } else if (somethingLeftBelow) {
                // console.log("scrolling because everything on this screen has been clicked but there's more below document.body.clientHeight="
                //              + document.body.clientHeight);
                window.scrollBy(0, 200);
                this.idleSince = null;
            } else if (window.scrollY + window.innerHeight < document.documentElement.scrollHeight) {
                // console.log("scrolling because we're not to the bottom yet document.body.clientHeight="
                //              + document.body.clientHeight);
                window.scrollBy(0, 200);
                this.idleSince = null;
            } else if (this.idleSince == null) {
                this.idleSince = Date.now();
            }
        }

        // every branch above that clicked or scrolled reset idleSince to
        // null, so this restarts the idle clock at "now"; when nothing
        // happened the earlier timestamp is kept
        if (!this.idleSince) {
            this.idleSince = Date.now();
        }
    },

    // Schedules intervalFunc() to run every 500ms and remembers the timer
    // id in this.intervalId so isFinished() can cancel it.
    start : function() {
        var that = this;
        this.intervalId = setInterval(function() {
            that.intervalFunc()
        }, 500);
    },

    // Returns true, and cancels the polling timer, once more than
    // IDLE_TIMEOUT_SEC seconds have passed since idleSince; false otherwise.
    isFinished : function() {
        if (this.idleSince != null) {
            var idleTimeMs = Date.now() - this.idleSince;
            if (idleTimeMs / 1000 > this.IDLE_TIMEOUT_SEC) {
                clearInterval(this.intervalId);
                return true;
            }
        }
        return false;
    },

    // Returns -1 if the top edge of e is above the viewport, 1 if it is
    // below the viewport, 0 if it is within it. getBoundingClientRect() is
    // already relative to the viewport, so the top edge is compared with 0
    // and window.innerHeight and window.scrollY must not be added in.
    aboveBelowOrOnScreen : function(e) {
        var eTop = e.getBoundingClientRect().top;
        if (eTop < 0) {
            return -1; // above
        } else if (eTop > window.innerHeight) {
            return 1;  // below
        } else {
            return 0;  // on screen
        }
    },
};
// Called from outside of this script; reports whatever
// umbraBehavior.isFinished() reports.
var umbraBehaviorFinished = function() {
    var finished = umbraBehavior.isFinished();
    return finished;
};

// kick off the polling loop as soon as the script is evaluated
umbraBehavior.start();

View File

@ -119,7 +119,15 @@ def new_site(frontier, site):
class ElapsedMixIn(object):
def elapsed(self):
'''Returns elapsed crawl time as a float in seconds.'''
'''
Returns elapsed crawl time as a float in seconds.
This metric includes all the time that a site was in active rotation,
including any time it spent waiting for its turn to be brozzled.
In contrast `Site.active_brozzling_time` only counts time when a
brozzler worker claimed the site and was actively brozzling it.
'''
dt = 0
for ss in self.starts_and_stops[:-1]:
dt += (ss['stop'] - ss['start']).total_seconds()

View File

@ -36,6 +36,7 @@ import tempfile
import urlcanon
from requests.structures import CaseInsensitiveDict
import rethinkdb as r
import datetime
class ExtraHeaderAdder(urllib.request.BaseHandler):
def __init__(self, extra_headers):
@ -102,7 +103,8 @@ class BrozzlerWorker:
def __init__(
self, frontier, service_registry=None, max_browsers=1,
chrome_exe="chromium-browser", warcprox_auto=False, proxy=None):
chrome_exe="chromium-browser", warcprox_auto=False, proxy=None,
skip_extract_outlinks=False, skip_visit_hashtags=False):
self._frontier = frontier
self._service_registry = service_registry
self._max_browsers = max_browsers
@ -111,6 +113,8 @@ class BrozzlerWorker:
self._proxy = proxy
assert not (warcprox_auto and proxy)
self._proxy_is_warcprox = None
self._skip_extract_outlinks = skip_extract_outlinks
self._skip_visit_hashtags = skip_visit_hashtags
self._browser_pool = brozzler.browser.BrowserPool(
max_browsers, chrome_exe=chrome_exe, ignore_cert_errors=True)
@ -156,7 +160,23 @@ class BrozzlerWorker:
else:
return bool(site.proxy or self._warcprox_auto)
def _youtube_dl(self, destdir, site):
def ydl_progress(*args, **kwargs):
    '''
    Progress hook for youtube-dl (registered under "progress_hooks").

    Ignores its arguments. If more than 7 minutes have passed since
    `site.last_claimed`, sets it to the current time and saves the site.
    Any ordinary error raised while doing so is logged at debug level and
    not propagated.
    '''
    # in case youtube-dl takes a long time, heartbeat site.last_claimed
    # to prevent another brozzler-worker from claiming the site
    try:
        if site.rr and (
                doublethink.utcnow() - site.last_claimed
                > datetime.timedelta(minutes=7)):
            self.logger.debug(
                    'heartbeating site.last_claimed to prevent another '
                    'brozzler-worker claiming this site id=%r', site.id)
            site.last_claimed = doublethink.utcnow()
            site.save()
    except Exception:
        # best effort: log and carry on, but unlike a bare `except:` let
        # SystemExit and KeyboardInterrupt through
        self.logger.debug(
                'problem heartbeating site.last_claimed site id=%r',
                site.id, exc_info=True)
ydl_opts = {
"outtmpl": "{}/ydl%(autonumber)s.out".format(destdir),
"verbose": False,
@ -167,6 +187,11 @@ class BrozzlerWorker:
"noprogress": True,
"nopart": True,
"no_color": True,
"progress_hooks": [ydl_progress],
# https://github.com/rg3/youtube-dl/blob/master/README.md#format-selection
# "best: Select the best quality format represented by a single
# file with video and audio."
"format": "best/bestvideo+bestaudio",
}
if self._proxy_for(site):
ydl_opts["proxy"] = "http://{}".format(self._proxy_for(site))
@ -384,7 +409,9 @@ class BrozzlerWorker:
username=site.get('username'), password=site.get('password'),
user_agent=site.get('user_agent'),
on_screenshot=_on_screenshot, on_response=_on_response,
hashtags=page.hashtags)
hashtags=page.hashtags,
skip_extract_outlinks=self._skip_extract_outlinks,
skip_visit_hashtags=self._skip_visit_hashtags)
if final_page_url != page.url:
page.note_redirect(final_page_url)
return outlinks
@ -425,12 +452,12 @@ class BrozzlerWorker:
def brozzle_site(self, browser, site):
try:
start = time.time()
page = None
self._frontier.honor_stop_request(site)
self.logger.info(
"brozzling site (proxy=%r) %r",
self._proxy_for(site), site)
start = time.time()
while time.time() - start < 7 * 60:
site.refresh()
self._frontier.honor_stop_request(site)
@ -477,6 +504,8 @@ class BrozzlerWorker:
except:
self.logger.critical("unexpected exception", exc_info=True)
finally:
if start:
site.active_brozzling_time = (site.active_brozzling_time or 0) + time.time() - start
self._frontier.disclaim_site(site, page)
def _brozzle_site_thread_target(self, browser, site):

View File

@ -12,7 +12,6 @@ an example
id: myjob
time_limit: 60 # seconds
proxy: 127.0.0.1:8000 # point at warcprox for archiving
ignore_robots: false
warcprox_meta:
warc-prefix: job1
@ -82,8 +81,8 @@ Notice that:
settings reference
==================
id
--
``id``
------
+-----------+--------+----------+--------------------------+
| scope | type | required | default |
+===========+========+==========+==========================+
@ -92,8 +91,8 @@ id
An arbitrary identifier for this job. Must be unique across this deployment of
brozzler.
seeds
-----
``seeds``
---------
+-----------+------------------------+----------+---------+
| scope | type | required | default |
+===========+========================+==========+=========+
@ -103,8 +102,8 @@ List of seeds. Each item in the list is a dictionary (associative array) which
defines the seed. It must specify ``url`` (see below) and can additionally
specify any of the settings of scope *seed-level*.
url
---
``url``
-------
+------------+--------+----------+---------+
| scope | type | required | default |
+============+========+==========+=========+
@ -112,8 +111,11 @@ url
+------------+--------+----------+---------+
The seed url.
time_limit
----------
``metadata``
------------
``time_limit``
--------------
+-----------------------+--------+----------+---------+
| scope | type | required | default |
+=======================+========+==========+=========+
@ -124,28 +126,18 @@ enforced at the seed level. If a time limit is specified at the top level, it
is inherited by each seed as described above, and enforced individually on each
seed.
proxy
-----
+-----------------------+--------+----------+---------+
| scope | type | required | default |
+=======================+========+==========+=========+
| seed-level, top-level | string | no | *none* |
+-----------------------+--------+----------+---------+
HTTP proxy, with the format ``host:port``. Typically configured to point to
warcprox for archival crawling.
ignore_robots
-------------
+-----------------------+---------+----------+---------+
| scope | type | required | default |
+=======================+=========+==========+=========+
| seed-level, top-level | boolean | no | false |
+-----------------------+---------+----------+---------+
``ignore_robots``
-----------------
+-----------------------+---------+----------+-----------+
| scope | type | required | default |
+=======================+=========+==========+===========+
| seed-level, top-level | boolean | no | ``false`` |
+-----------------------+---------+----------+-----------+
If set to ``true``, brozzler will happily crawl pages that would otherwise be
blocked by robots.txt rules.
user_agent
----------
``user_agent``
--------------
+-----------------------+---------+----------+---------+
| scope | type | required | default |
+=======================+=========+==========+=========+
@ -156,13 +148,13 @@ It's good etiquette to include a project URL with a notice to webmasters that
explains why you're crawling, how to block the crawler with robots.txt and how to
contact the operator if the crawl is causing problems.
warcprox_meta
-------------
+-----------------------+------------+----------+---------+
| scope | type | required | default |
+=======================+============+==========+=========+
| seed-level, top-level | dictionary | no | false |
+-----------------------+------------+----------+---------+
``warcprox_meta``
-----------------
+-----------------------+------------+----------+-----------+
| scope | type | required | default |
+=======================+============+==========+===========+
| seed-level, top-level | dictionary | no | ``false`` |
+-----------------------+------------+----------+-----------+
Specifies the Warcprox-Meta header to send with every request, if ``proxy`` is
configured. The value of the Warcprox-Meta header is a json blob. It is used to
pass settings and information to warcprox. Warcprox does not forward the header
@ -183,11 +175,37 @@ becomes::
Warcprox-Meta: {"warc-prefix":"job1-seed1","stats":{"buckets":["job1-stats","job1-seed1-stats"]}}
scope
-----
+-----------------------+------------+----------+---------+
| scope | type | required | default |
+=======================+============+==========+=========+
| seed-level, top-level | dictionary | no | false |
+-----------------------+------------+----------+---------+
``scope``
---------
+-----------------------+------------+----------+-----------+
| scope | type | required | default |
+=======================+============+==========+===========+
| seed-level, top-level | dictionary | no | ``false`` |
+-----------------------+------------+----------+-----------+
Scope rules. *TODO*
``surt``
--------
+-------------+--------+----------+---------------------------+
| scope | type | required | default |
+=============+========+==========+===========================+
| scope-level | string | no | *generated from seed url* |
+-------------+--------+----------+---------------------------+
``accepts``
-----------
+-------------+------+----------+---------+
| scope | type | required | default |
+=============+======+==========+=========+
| scope-level | list | no | *none* |
+-------------+------+----------+---------+
``blocks``
-----------
+-------------+------+----------+---------+
| scope | type | required | default |
+=============+======+==========+=========+
| scope-level | list | no | *none* |
+-------------+------+----------+---------+

View File

@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup(
name='brozzler',
version='1.1b12.dev257',
version='1.1b12.dev265',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',

View File

@ -238,6 +238,9 @@ def test_resume_job():
assert site.starts_and_stops[2]['stop'] > site.starts_and_stops[0]['start']
def test_time_limit():
# XXX test not thoroughly adapted to change in time accounting, since
# starts_and_stops is no longer used to enforce time limits
# vagrant brozzler-worker isn't configured to look at the "ignoreme" db
rr = doublethink.Rethinker('localhost', db='ignoreme')
frontier = brozzler.RethinkDbFrontier(rr)
@ -277,9 +280,16 @@ def test_time_limit():
site.claimed = True
site.save()
time.sleep(0.1)
# time limit not reached yet
frontier._enforce_time_limit(site)
assert site.status == 'ACTIVE'
assert len(site.starts_and_stops) == 2
assert site.starts_and_stops[1]['start']
assert site.starts_and_stops[1]['stop'] is None
site.active_brozzling_time = 0.2 # this is why the time limit will be hit
frontier._enforce_time_limit(site)
assert site.status == 'FINISHED_TIME_LIMIT'
assert not site.claimed
assert len(site.starts_and_stops) == 2