mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-05-02 06:36:20 -04:00
Enforce time limits based on the time a worker has claimed the site and been actively brozzling it, to avoid stopping crawls that haven't had much chance to crawl because the cluster was busy.
This commit is contained in:
parent
3385d727ac
commit
b9640b8a30
4 changed files with 15 additions and 5 deletions
|
@ -135,10 +135,11 @@ class RethinkDbFrontier:
|
|||
|
||||
def _enforce_time_limit(self, site):
|
||||
if (site.time_limit and site.time_limit > 0
|
||||
and site.elapsed() > site.time_limit):
|
||||
and (site.active_brozzling_time or 0) > site.time_limit):
|
||||
self.logger.debug(
|
||||
"site FINISHED_TIME_LIMIT! time_limit=%s elapsed=%s %s",
|
||||
site.time_limit, site.elapsed(), site)
|
||||
"site FINISHED_TIME_LIMIT! time_limit=%s "
|
||||
"active_brozzling_time=%s %s", site.time_limit,
|
||||
site.active_brozzling_time, site)
|
||||
self.finished(site, "FINISHED_TIME_LIMIT")
|
||||
return True
|
||||
else:
|
||||
|
|
|
@ -119,7 +119,15 @@ def new_site(frontier, site):
|
|||
|
||||
class ElapsedMixIn(object):
|
||||
def elapsed(self):
|
||||
'''Returns elapsed crawl time as a float in seconds.'''
|
||||
'''
|
||||
Returns elapsed crawl time as a float in seconds.
|
||||
|
||||
This metric includes all the time that a site was in active rotation,
|
||||
including any time it spent waiting for its turn to be brozzled.
|
||||
|
||||
In contrast `Site.active_brozzling_time` only counts time when a
|
||||
brozzler worker claimed the site and was actively brozzling it.
|
||||
'''
|
||||
dt = 0
|
||||
for ss in self.starts_and_stops[:-1]:
|
||||
dt += (ss['stop'] - ss['start']).total_seconds()
|
||||
|
|
|
@ -499,6 +499,7 @@ class BrozzlerWorker:
|
|||
except:
|
||||
self.logger.critical("unexpected exception", exc_info=True)
|
||||
finally:
|
||||
site.active_brozzling_time = (site.active_brozzling_time or 0) + time.time() - start
|
||||
self._frontier.disclaim_site(site, page)
|
||||
|
||||
def _brozzle_site_thread_target(self, browser, site):
|
||||
|
|
2
setup.py
2
setup.py
|
@ -32,7 +32,7 @@ def find_package_data(package):
|
|||
|
||||
setuptools.setup(
|
||||
name='brozzler',
|
||||
version='1.1b12.dev262',
|
||||
version='1.1b12.dev263',
|
||||
description='Distributed web crawling with browsers',
|
||||
url='https://github.com/internetarchive/brozzler',
|
||||
author='Noah Levitt',
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue