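Enforce time limits based on the time a worker has actually spent brozzling a site while holding its claim (`active_brozzling_time`), rather than on total elapsed time. This avoids prematurely stopping crawls that have had little chance to crawl because the cluster was busy.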

This commit is contained in:
Noah Levitt 2017-06-26 18:00:32 -07:00
parent 3385d727ac
commit b9640b8a30
4 changed files with 15 additions and 5 deletions

View file

@ -135,10 +135,11 @@ class RethinkDbFrontier:
def _enforce_time_limit(self, site): def _enforce_time_limit(self, site):
if (site.time_limit and site.time_limit > 0 if (site.time_limit and site.time_limit > 0
and site.elapsed() > site.time_limit): and (site.active_brozzling_time or 0) > site.time_limit):
self.logger.debug( self.logger.debug(
"site FINISHED_TIME_LIMIT! time_limit=%s elapsed=%s %s", "site FINISHED_TIME_LIMIT! time_limit=%s "
site.time_limit, site.elapsed(), site) "active_brozzling_time=%s %s", site.time_limit,
site.active_brozzling_time, site)
self.finished(site, "FINISHED_TIME_LIMIT") self.finished(site, "FINISHED_TIME_LIMIT")
return True return True
else: else:

View file

@ -119,7 +119,15 @@ def new_site(frontier, site):
class ElapsedMixIn(object): class ElapsedMixIn(object):
def elapsed(self): def elapsed(self):
'''Returns elapsed crawl time as a float in seconds.''' '''
Returns elapsed crawl time as a float in seconds.
This metric includes all the time that a site was in active rotation,
including any time it spent waiting for its turn to be brozzled.
In contrast `Site.active_brozzling_time` only counts time when a
brozzler worker claimed the site and was actively brozzling it.
'''
dt = 0 dt = 0
for ss in self.starts_and_stops[:-1]: for ss in self.starts_and_stops[:-1]:
dt += (ss['stop'] - ss['start']).total_seconds() dt += (ss['stop'] - ss['start']).total_seconds()

View file

@ -499,6 +499,7 @@ class BrozzlerWorker:
except: except:
self.logger.critical("unexpected exception", exc_info=True) self.logger.critical("unexpected exception", exc_info=True)
finally: finally:
site.active_brozzling_time = (site.active_brozzling_time or 0) + time.time() - start
self._frontier.disclaim_site(site, page) self._frontier.disclaim_site(site, page)
def _brozzle_site_thread_target(self, browser, site): def _brozzle_site_thread_target(self, browser, site):

View file

@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup( setuptools.setup(
name='brozzler', name='brozzler',
version='1.1b12.dev262', version='1.1b12.dev263',
description='Distributed web crawling with browsers', description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler', url='https://github.com/internetarchive/brozzler',
author='Noah Levitt', author='Noah Levitt',