mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-21 16:16:28 -04:00
--warcprox-auto distribute assigned sites evenly
When running with --warcprox-auto, choose the instance of warcprox with the fewest assigned sites, instead of the lowest load in the service registry. In practice we often start brozzling a whole bunch of sites at approximately the same time, and because it takes time for that to affect the "load" reported by warcprox instances, sites end up being distributed very unevenly.
This commit is contained in:
parent
9e80a3b0d3
commit
bc4b2f3145
0
brozzler/cli.py
Executable file → Normal file
0
brozzler/cli.py
Executable file → Normal file
@ -129,13 +129,30 @@ class BrozzlerWorker:
|
||||
self._start_stop_lock = threading.Lock()
|
||||
self._shutdown = threading.Event()
|
||||
|
||||
def _choose_warcprox(self):
    '''
    Pick the warcprox instance that should serve the next site.

    Every available warcprox entry is annotated in place with
    `assigned_sites`, the number of ACTIVE sites whose `proxy` field equals
    that instance's "host:port" address. The instance with the fewest
    assigned sites is chosen; ties are broken by the lowest `load` value
    of the service registry entry.

    Returns:
        the chosen service registry entry, or None if the service
        registry has no available warcprox instances
    '''
    from collections import Counter

    warcproxes = self._service_registry.available_services('warcprox')
    if not warcproxes:
        return None
    active_sites = self._frontier.rr.table('sites').between(
            ['ACTIVE', r.minval], ['ACTIVE', r.maxval],
            index='sites_last_disclaimed').run()
    # Tally in a single pass. The query result may be a one-shot cursor, in
    # which case re-iterating it for every warcprox would count zero sites
    # for all but the first instance and defeat the balancing.
    sites_per_proxy = Counter(
            site['proxy'] for site in active_sites if 'proxy' in site)
    for warcprox in warcproxes:
        address = '%s:%s' % (warcprox['host'], warcprox['port'])
        # Counter yields 0 for addresses with no active sites
        warcprox['assigned_sites'] = sites_per_proxy[address]
    warcproxes.sort(key=lambda warcprox: (
        warcprox['assigned_sites'], warcprox['load']))
    # XXX make this heuristic more advanced?
    return warcproxes[0]
|
||||
|
||||
def _proxy_for(self, site):
|
||||
if self._proxy:
|
||||
return self._proxy
|
||||
elif site.proxy:
|
||||
return site.proxy
|
||||
elif self._warcprox_auto:
|
||||
svc = self._service_registry.available_service('warcprox')
|
||||
svc = self._choose_warcprox()
|
||||
if svc is None:
|
||||
raise brozzler.ProxyError(
|
||||
'no available instances of warcprox in the service '
|
||||
|
Loading…
x
Reference in New Issue
Block a user