From bc4b2f314580f087c37d90dc939a55cc73c8d93a Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Fri, 19 Jan 2018 11:35:06 -0800
Subject: [PATCH] --warcprox-auto distribute assigned sites evenly

When running with --warcprox-auto, choose the instance of warcprox with
the least number of assigned sites, instead of the lowest load in the
service registry. In practice we often start brozzling a whole bunch of
sites at approximately the same time, and because it takes time for that
to affect the "load" reported by warcprox instances, sites end up being
distributed very unevenly.
---
 brozzler/cli.py    |  0
 brozzler/worker.py | 19 ++++++++++++++++++-
 2 files changed, 18 insertions(+), 1 deletion(-)
 mode change 100755 => 100644 brozzler/cli.py

diff --git a/brozzler/cli.py b/brozzler/cli.py
old mode 100755
new mode 100644
diff --git a/brozzler/worker.py b/brozzler/worker.py
index 4955f2c..49a120a 100644
--- a/brozzler/worker.py
+++ b/brozzler/worker.py
@@ -129,13 +129,30 @@ class BrozzlerWorker:
         self._start_stop_lock = threading.Lock()
         self._shutdown = threading.Event()
 
+    def _choose_warcprox(self):
+        warcproxes = self._service_registry.available_services('warcprox')
+        if not warcproxes:
+            return None
+        active_sites = self._frontier.rr.table('sites').between(
+                ['ACTIVE', r.minval], ['ACTIVE', r.maxval],
+                index='sites_last_disclaimed').run()
+        for warcprox in warcproxes:
+            address = '%s:%s' % (warcprox['host'], warcprox['port'])
+            warcprox['assigned_sites'] = len([
+                site for site in active_sites
+                if 'proxy' in site and site['proxy'] == address])
+        warcproxes.sort(key=lambda warcprox: (
+            warcprox['assigned_sites'], warcprox['load']))
+        # XXX make this heuristic more advanced?
+        return warcproxes[0]
+
     def _proxy_for(self, site):
         if self._proxy:
             return self._proxy
         elif site.proxy:
             return site.proxy
         elif self._warcprox_auto:
-            svc = self._service_registry.available_service('warcprox')
+            svc = self._choose_warcprox()
             if svc is None:
                 raise brozzler.ProxyError(
                         'no available instances of warcprox in the service '