Merge pull request #187 from internetarchive/optimizes-rethinkdb-load-query

With the last commit, the only test failure is the unrelated test_brozzling.py::test_page_interstitial_exception (already marked xfail in qa).
This commit is contained in:
Barbara Miller 2020-03-11 21:29:01 -07:00 committed by GitHub
commit 4c0785fbfc
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -81,12 +81,23 @@ class BrozzlerWorker:
warcproxes = self._service_registry.available_services('warcprox') warcproxes = self._service_registry.available_services('warcprox')
if not warcproxes: if not warcproxes:
return None return None
warcproxes.sort(key=lambda warcprox: (warcprox['load'])) # .group('proxy').count() makes this query about 99% more efficient
num_choices = 5 reql = self._frontier.rr.table('sites').between(
if len(warcproxes) < num_choices: ['ACTIVE', r.minval], ['ACTIVE', r.maxval],
num_choices = len(warcproxes) index='sites_last_disclaimed').group('proxy').count()
# returns results like
# {
# "wbgrp-svc030.us.archive.org:8000": 148,
# "wbgrp-svc030.us.archive.org:8001": 145
# }
proxy_scoreboard = dict(reql.run())
for warcprox in warcproxes:
address = '%s:%s' % (warcprox['host'], warcprox['port'])
warcprox['assigned_sites'] = proxy_scoreboard.get(address, 0)
warcproxes.sort(key=lambda warcprox: (
warcprox['assigned_sites'], warcprox['load']))
# XXX make this heuristic more advanced? # XXX make this heuristic more advanced?
return random.choice(warcproxes[0:num_choices]) return warcproxes[0]
def _proxy_for(self, site): def _proxy_for(self, site):
if self._proxy: if self._proxy: