diff --git a/brozzler/worker.py b/brozzler/worker.py index ea2e3cc..6021deb 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -81,12 +81,33 @@ class BrozzlerWorker: warcproxes = self._service_registry.available_services('warcprox') if not warcproxes: return None - warcproxes.sort(key=lambda warcprox: (warcprox['load'])) - num_choices = 5 - if len(warcproxes) < num_choices: - num_choices = len(warcproxes) + # .group('proxy').count() makes this query about 99% more efficient + reql = self._frontier.rr.table('sites').between( + ['ACTIVE', r.minval], ['ACTIVE', r.maxval], + index='sites_last_disclaimed').group('proxy').count() + # returns results like + # [ { + # "group": "wbgrp-svc030.us.archive.org:8000" , + # "reduction": 148 + # } , + # { + # "group": "wbgrp-svc030.us.archive.org:8001" , + # "reduction": 145 + # }] + proxy_list = list(reql.run()) + # convert to structure like: + # { + # "wbgrp-svc030.us.archive.org:8000": 148, + # "wbgrp-svc030.us.archive.org:8001": 145 + # } + proxy_scoreboard = {proxy['group']: proxy['reduction'] for proxy in proxy_list} + for warcprox in warcproxes: + address = '%s:%s' % (warcprox['host'], warcprox['port']) + warcprox['assigned_sites'] = proxy_scoreboard.get('address', 0) + warcproxes.sort(key=lambda warcprox: ( + warcprox['assigned_sites'], warcprox['load'])) # XXX make this heuristic more advanced? - return random.choice(warcproxes[0:num_choices]) + return warcproxes[0] def _proxy_for(self, site): if self._proxy: