Merge branch 'optimizes-rethinkdb-load-query' into qa

This commit is contained in:
James Kafader 2020-03-11 16:09:52 -07:00
commit 59b67ac3d5

View File

@ -81,12 +81,33 @@ class BrozzlerWorker:
warcproxes = self._service_registry.available_services('warcprox')
if not warcproxes:
return None
warcproxes.sort(key=lambda warcprox: (warcprox['load']))
num_choices = 5
if len(warcproxes) < num_choices:
num_choices = len(warcproxes)
# .group('proxy').count() makes this query about 99% more efficient
reql = self._frontier.rr.table('sites').between(
['ACTIVE', r.minval], ['ACTIVE', r.maxval],
index='sites_last_disclaimed').group('proxy').count()
# returns results like
# [ {
# "group": "wbgrp-svc030.us.archive.org:8000" ,
# "reduction": 148
# } ,
# {
# "group": "wbgrp-svc030.us.archive.org:8001" ,
# "reduction": 145
# }]
proxy_list = list(reql.run())
# convert to structure like:
# {
# "wbgrp-svc030.us.archive.org:8000": 148,
# "wbgrp-svc030.us.archive.org:8001": 145
# }
proxy_scoreboard = {proxy['group']: proxy['reduction'] for proxy in proxy_list}
for warcprox in warcproxes:
address = '%s:%s' % (warcprox['host'], warcprox['port'])
warcprox['assigned_sites'] = proxy_scoreboard.get('address', 0)
warcproxes.sort(key=lambda warcprox: (
warcprox['assigned_sites'], warcprox['load']))
# XXX make this heuristic more advanced?
return random.choice(warcproxes[0:num_choices])
return warcproxes[0]
def _proxy_for(self, site):
if self._proxy: