mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-20 23:56:34 -04:00
Issue #231 - How does worker pick a site after crash?
- Make the claimed limit configurable; it was hard-coded to 60 minutes. Nodes that crash can come back online fairly quickly, so a shorter reclaim window may be desirable.
This commit is contained in:
parent
4f301f4e03
commit
51b2474b3c
@ -327,6 +327,10 @@ def brozzler_worker(argv=None):
|
||||
help=(
|
||||
'when needed, choose an available instance of warcprox from '
|
||||
'the rethinkdb service registry'))
|
||||
arg_parser.add_argument(
|
||||
'--claimed-limit', metavar='int', type=int, default=60, choices=range(15, 61), dest='claimed_limit',
|
||||
help=('Minutes after worker crash, a site can be reclaimed'
|
||||
'an integer in the range 15..60 (default: 60)'))
|
||||
arg_parser.add_argument(
|
||||
'--skip-extract-outlinks', dest='skip_extract_outlinks',
|
||||
action='store_true', help=argparse.SUPPRESS)
|
||||
@ -370,6 +374,7 @@ def brozzler_worker(argv=None):
|
||||
frontier, service_registry, max_browsers=int(args.max_browsers),
|
||||
chrome_exe=args.chrome_exe, proxy=args.proxy,
|
||||
warcprox_auto=args.warcprox_auto,
|
||||
claimed_limit=args.claimed_limit,
|
||||
skip_extract_outlinks=args.skip_extract_outlinks,
|
||||
skip_visit_hashtags=args.skip_visit_hashtags,
|
||||
skip_youtube_dl=args.skip_youtube_dl)
|
||||
|
@ -93,7 +93,7 @@ class RethinkDbFrontier:
|
||||
raise UnexpectedDbResult("expected %r to be %r in %r" % (
|
||||
k, expected, result))
|
||||
|
||||
def claim_sites(self, n=1):
|
||||
def claim_sites(self, n=1, claimed_limit=60):
|
||||
self.logger.trace('claiming up to %s sites to brozzle', n)
|
||||
result = (
|
||||
self.rr.table('sites').get_all(r.args(
|
||||
@ -114,7 +114,7 @@ class RethinkDbFrontier:
|
||||
r.and_(
|
||||
r.or_(
|
||||
site['claimed'].not_(),
|
||||
site['last_claimed'].lt(r.now().sub(60*60))),
|
||||
site['last_claimed'].lt(r.now().sub(claimed_limit*60))),
|
||||
r.or_(
|
||||
site.has_fields('max_claimed_sites').not_(),
|
||||
new_acc[site['job_id'].coerce_to('string')].le(site['max_claimed_sites']))),
|
||||
@ -127,7 +127,7 @@ class RethinkDbFrontier:
|
||||
r.branch(
|
||||
r.or_(
|
||||
r.row['claimed'].not_(),
|
||||
r.row['last_claimed'].lt(r.now().sub(60*60))),
|
||||
r.row['last_claimed'].lt(r.now().sub(claimed_limit*60))),
|
||||
{'claimed': True, 'last_claimed': r.now()},
|
||||
{}),
|
||||
return_changes=True)).run()
|
||||
|
@ -49,7 +49,7 @@ class BrozzlerWorker:
|
||||
|
||||
def __init__(
|
||||
self, frontier, service_registry=None, max_browsers=1,
|
||||
chrome_exe="chromium-browser", warcprox_auto=False, proxy=None,
|
||||
chrome_exe="chromium-browser", warcprox_auto=False, proxy=None, claimed_limit=60,
|
||||
skip_extract_outlinks=False, skip_visit_hashtags=False,
|
||||
skip_youtube_dl=False, simpler404=False, screenshot_full_page=False,
|
||||
page_timeout=300, behavior_timeout=900, extract_outlinks_timeout=60,
|
||||
@ -62,6 +62,7 @@ class BrozzlerWorker:
|
||||
self._proxy = proxy
|
||||
assert not (warcprox_auto and proxy)
|
||||
self._proxy_is_warcprox = None
|
||||
self._claimed_limit = claimed_limit
|
||||
self._skip_extract_outlinks = skip_extract_outlinks
|
||||
self._skip_visit_hashtags = skip_visit_hashtags
|
||||
self._skip_youtube_dl = skip_youtube_dl
|
||||
@ -488,7 +489,7 @@ class BrozzlerWorker:
|
||||
browsers = self._browser_pool.acquire_multi(
|
||||
(self._browser_pool.num_available() + 1) // 2)
|
||||
try:
|
||||
sites = self._frontier.claim_sites(len(browsers))
|
||||
sites = self._frontier.claim_sites(len(browsers), self._claimed_limit)
|
||||
except:
|
||||
self._browser_pool.release_all(browsers)
|
||||
raise
|
||||
|
Loading…
x
Reference in New Issue
Block a user