From 3defd49677c90eb42652f1101645b31b02823d53 Mon Sep 17 00:00:00 2001 From: James Kafader Date: Wed, 11 Mar 2020 16:09:16 -0700 Subject: [PATCH 1/4] new selection function, based on optimized query --- brozzler/worker.py | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/brozzler/worker.py b/brozzler/worker.py index ea2e3cc..6021deb 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -81,12 +81,33 @@ class BrozzlerWorker: warcproxes = self._service_registry.available_services('warcprox') if not warcproxes: return None - warcproxes.sort(key=lambda warcprox: (warcprox['load'])) - num_choices = 5 - if len(warcproxes) < num_choices: - num_choices = len(warcproxes) + # .group('proxy').count() makes this query about 99% more efficient + reql = self._frontier.rr.table('sites').between( + ['ACTIVE', r.minval], ['ACTIVE', r.maxval], + index='sites_last_disclaimed').group('proxy').count() + # returns results like + # [ { + # "group": "wbgrp-svc030.us.archive.org:8000" , + # "reduction": 148 + # } , + # { + # "group": "wbgrp-svc030.us.archive.org:8001" , + # "reduction": 145 + # }] + proxy_list = list(reql.run()) + # convert to structure like: + # { + # "wbgrp-svc030.us.archive.org:8000": 148, + # "wbgrp-svc030.us.archive.org:8001": 145 + # } + proxy_scoreboard = {proxy['group']: proxy['reduction'] for proxy in proxy_list} + for warcprox in warcproxes: + address = '%s:%s' % (warcprox['host'], warcprox['port']) + warcprox['assigned_sites'] = proxy_scoreboard.get('address', 0) + warcproxes.sort(key=lambda warcprox: ( + warcprox['assigned_sites'], warcprox['load'])) # XXX make this heuristic more advanced? - return random.choice(warcproxes[0:num_choices]) + return warcproxes[0] def _proxy_for(self, site): if self._proxy: From b9c5e4b66ceff9bd72bbf29e63933f88c6556f80 Mon Sep 17 00:00:00 2001 From: James Kafader Date: Wed, 11 Mar 2020 19:15:57 -0700 Subject: [PATCH 2/4] fix output format --- brozzler/worker.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/brozzler/worker.py b/brozzler/worker.py index 6021deb..bb56b0a 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -86,21 +86,11 @@ class BrozzlerWorker: ['ACTIVE', r.minval], ['ACTIVE', r.maxval], index='sites_last_disclaimed').group('proxy').count() # returns results like - # [ { - # "group": "wbgrp-svc030.us.archive.org:8000" , - # "reduction": 148 - # } , - # { - # "group": "wbgrp-svc030.us.archive.org:8001" , - # "reduction": 145 - # }] - proxy_list = list(reql.run()) - # convert to structure like: # { # "wbgrp-svc030.us.archive.org:8000": 148, # "wbgrp-svc030.us.archive.org:8001": 145 # } - proxy_scoreboard = {proxy['group']: proxy['reduction'] for proxy in proxy_list} + proxy_scoreboard = list(reql.run()) for warcprox in warcproxes: address = '%s:%s' % (warcprox['host'], warcprox['port']) warcprox['assigned_sites'] = proxy_scoreboard.get('address', 0) From 313cec3139dc6485f791d5cd0aacc8f946130481 Mon Sep 17 00:00:00 2001 From: James Kafader Date: Wed, 11 Mar 2020 19:31:02 -0700 Subject: [PATCH 3/4] coerce to dict not list --- brozzler/worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/brozzler/worker.py b/brozzler/worker.py index bb56b0a..39287d7 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -90,7 +90,7 @@ class BrozzlerWorker: # "wbgrp-svc030.us.archive.org:8000": 148, # "wbgrp-svc030.us.archive.org:8001": 145 # } - proxy_scoreboard = list(reql.run()) + proxy_scoreboard = dict(reql.run()) for warcprox in warcproxes: address = '%s:%s' % (warcprox['host'], warcprox['port']) warcprox['assigned_sites'] = proxy_scoreboard.get('address', 0) From c4beeefe015cb7dd8fdac2c636af890e1a325edc Mon Sep 17 00:00:00 2001 From: Barbara Miller Date: Wed, 11 Mar 2020 20:56:52 -0700 Subject: [PATCH 4/4] address var --- brozzler/worker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/brozzler/worker.py b/brozzler/worker.py index 39287d7..7437927 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -93,7 +93,7 @@ class BrozzlerWorker: proxy_scoreboard = dict(reql.run()) for warcprox in warcproxes: address = '%s:%s' % (warcprox['host'], warcprox['port']) - warcprox['assigned_sites'] = proxy_scoreboard.get('address', 0) + warcprox['assigned_sites'] = proxy_scoreboard.get(address, 0) warcproxes.sort(key=lambda warcprox: ( warcprox['assigned_sites'], warcprox['load'])) # XXX make this heuristic more advanced?