--warcprox-auto distribute assigned sites evenly

When running with --warcprox-auto, choose the warcprox instance with the
fewest assigned sites, instead of the one with the lowest load in the
service registry. In practice we often start brozzling a whole bunch of
sites at approximately the same time, and because it takes time for that
to affect the "load" reported by warcprox instances, sites end up being
distributed very unevenly.
This commit is contained in:
Noah Levitt 2018-01-19 11:35:06 -08:00
parent 9e80a3b0d3
commit bc4b2f3145
2 changed files with 18 additions and 1 deletions

0
brozzler/cli.py Executable file → Normal file
View File

View File

@ -129,13 +129,30 @@ class BrozzlerWorker:
self._start_stop_lock = threading.Lock()
self._shutdown = threading.Event()
def _choose_warcprox(self):
    """
    Pick the warcprox instance that should be handed the next site.

    Looks up the available 'warcprox' services in the service registry,
    counts how many ACTIVE sites currently have each instance's
    'host:port' address as their 'proxy', and records that count on each
    service entry under the 'assigned_sites' key.

    Returns:
        the service entry with the fewest assigned sites (ties broken by
        the lowest reported 'load'), or None if the registry has no
        available warcprox instance
    """
    warcproxes = self._service_registry.available_services('warcprox')
    if not warcproxes:
        return None
    active_sites = self._frontier.rr.table('sites').between(
            ['ACTIVE', r.minval], ['ACTIVE', r.maxval],
            index='sites_last_disclaimed').run()
    # Walk the query result exactly once. If run() hands back a one-shot
    # cursor, iterating it again for every warcprox would leave every
    # instance after the first with a count of zero.
    assigned_proxies = [
            site['proxy'] for site in active_sites if 'proxy' in site]
    for warcprox in warcproxes:
        address = '%s:%s' % (warcprox['host'], warcprox['port'])
        warcprox['assigned_sites'] = assigned_proxies.count(address)
    # XXX make this heuristic more advanced?
    # min() returns the first minimal entry, the same one a stable sort
    # would have placed at index 0.
    return min(warcproxes, key=lambda warcprox: (
        warcprox['assigned_sites'], warcprox['load']))
def _proxy_for(self, site):
if self._proxy:
return self._proxy
elif site.proxy:
return site.proxy
elif self._warcprox_auto:
svc = self._service_registry.available_service('warcprox')
svc = self._choose_warcprox()
if svc is None:
raise brozzler.ProxyError(
'no available instances of warcprox in the service '