mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 08:39:59 -05:00
Merge pull request #187 from internetarchive/optimizes-rethinkdb-load-query
With the last commit, the only remaining test failure is the unrelated test_brozzling.py::test_page_interstitial_exception (already marked xfail in qa).
This commit is contained in:
commit
4c0785fbfc
@ -81,12 +81,23 @@ class BrozzlerWorker:
|
|||||||
warcproxes = self._service_registry.available_services('warcprox')
|
warcproxes = self._service_registry.available_services('warcprox')
|
||||||
if not warcproxes:
|
if not warcproxes:
|
||||||
return None
|
return None
|
||||||
warcproxes.sort(key=lambda warcprox: (warcprox['load']))
|
# .group('proxy').count() makes this query about 99% more efficient
|
||||||
num_choices = 5
|
reql = self._frontier.rr.table('sites').between(
|
||||||
if len(warcproxes) < num_choices:
|
['ACTIVE', r.minval], ['ACTIVE', r.maxval],
|
||||||
num_choices = len(warcproxes)
|
index='sites_last_disclaimed').group('proxy').count()
|
||||||
|
# returns results like
|
||||||
|
# {
|
||||||
|
# "wbgrp-svc030.us.archive.org:8000": 148,
|
||||||
|
# "wbgrp-svc030.us.archive.org:8001": 145
|
||||||
|
# }
|
||||||
|
proxy_scoreboard = dict(reql.run())
|
||||||
|
for warcprox in warcproxes:
|
||||||
|
address = '%s:%s' % (warcprox['host'], warcprox['port'])
|
||||||
|
warcprox['assigned_sites'] = proxy_scoreboard.get(address, 0)
|
||||||
|
warcproxes.sort(key=lambda warcprox: (
|
||||||
|
warcprox['assigned_sites'], warcprox['load']))
|
||||||
# XXX make this heuristic more advanced?
|
# XXX make this heuristic more advanced?
|
||||||
return random.choice(warcproxes[0:num_choices])
|
return warcproxes[0]
|
||||||
|
|
||||||
def _proxy_for(self, site):
|
def _proxy_for(self, site):
|
||||||
if self._proxy:
|
if self._proxy:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user