mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-08-10 07:20:39 -04:00
Merge branch 'max-claimed-sites' into qa
* max-claimed-sites:
  fix timely time limit enforcement
  honor stop request before choosing proxy
  fix query to make tests pass?
This commit is contained in:
commit
0c9ebcff6e
2 changed files with 19 additions and 9 deletions
|
@@ -98,7 +98,7 @@ class RethinkDbFrontier:
|
||||||
Returns a dictionary that looks like this:
|
Returns a dictionary that looks like this:
|
||||||
{<job_id>: {'claimed_sites': 2, 'max_claimed_sites': 3}, ...}
|
{<job_id>: {'claimed_sites': 2, 'max_claimed_sites': 3}, ...}
|
||||||
'''
|
'''
|
||||||
# js query: r.db('brozzler').table('sites').between(['ACTIVE', r.minval], ['ACTIVE', r.maxval], {'index': 'sites_last_disclaimed'}).eqJoin('job_id', r.db('brozzler').table('jobs')).group(function(x){return x('right')('id')}).ungroup().map(function(x){return {'job_id': x('group'), 'max_claimed_sites':x('reduction')('right')('max_claimed_sites')(0), 'claimed_sites':x('reduction')('left').filter({'claimed':true}).count()}})
|
# js query: r.db('brozzler').table('sites').between(['ACTIVE', r.minval], ['ACTIVE', r.maxval], {'index': 'sites_last_disclaimed'}).eqJoin('job_id', r.db('brozzler').table('jobs')).filter(function(x){return x('right').hasFields('max_claimed_sites')}).group(function(x){return x('right')('id')}).ungroup().map(function(x){return {'job_id': x('group'), 'max_claimed_sites':x('reduction')('right')('max_claimed_sites')(0), 'claimed_sites':x('reduction')('left').filter({'claimed':true}).count()}})
|
||||||
# returns results like:
|
# returns results like:
|
||||||
# [{'max_claimed_sites': 2, 'claimed_sites': 0, 'job_id': 1234},
|
# [{'max_claimed_sites': 2, 'claimed_sites': 0, 'job_id': 1234},
|
||||||
# {'max_claimed_sites': 3, 'claimed_sites': 3, 'job_id': 1235}]
|
# {'max_claimed_sites': 3, 'claimed_sites': 3, 'job_id': 1235}]
|
||||||
|
@@ -108,6 +108,7 @@ class RethinkDbFrontier:
|
||||||
['ACTIVE', r.minval], ['ACTIVE', r.maxval],
|
['ACTIVE', r.minval], ['ACTIVE', r.maxval],
|
||||||
index='sites_last_disclaimed')
|
index='sites_last_disclaimed')
|
||||||
.eq_join('job_id', r.db(self.rr.dbname).table('jobs'))
|
.eq_join('job_id', r.db(self.rr.dbname).table('jobs'))
|
||||||
|
.filter(lambda x: x['right'].has_fields('max_claimed_sites'))
|
||||||
.group(lambda x: x['right']['id'])
|
.group(lambda x: x['right']['id'])
|
||||||
.ungroup()
|
.ungroup()
|
||||||
.map(lambda x: {
|
.map(lambda x: {
|
||||||
|
@@ -115,10 +116,14 @@ class RethinkDbFrontier:
|
||||||
'max_claimed_sites': x['reduction']['right']['max_claimed_sites'][0],
|
'max_claimed_sites': x['reduction']['right']['max_claimed_sites'][0],
|
||||||
'claimed_sites': x['reduction']['left'].filter({'claimed':True}).count()
|
'claimed_sites': x['reduction']['left'].filter({'claimed':True}).count()
|
||||||
})).run()
|
})).run()
|
||||||
xformed = {d['job_id']: {
|
|
||||||
'claimed_sites': d['claimed_sites'],
|
xformed = {
|
||||||
'max_claimed_sites': d['max_claimed_sites']
|
d['job_id']: {
|
||||||
} for d in results}
|
'claimed_sites': d['claimed_sites'],
|
||||||
|
'max_claimed_sites': d['max_claimed_sites']
|
||||||
|
}
|
||||||
|
for d in results
|
||||||
|
}
|
||||||
return xformed
|
return xformed
|
||||||
|
|
||||||
def claim_sites(self, n=1):
|
def claim_sites(self, n=1):
|
||||||
|
@@ -192,12 +197,13 @@ class RethinkDbFrontier:
|
||||||
else:
|
else:
|
||||||
raise brozzler.NothingToClaim
|
raise brozzler.NothingToClaim
|
||||||
|
|
||||||
def enforce_time_limit(self, site):
|
def enforce_time_limit(self, site, session_time=0):
|
||||||
'''
|
'''
|
||||||
Raises `brozzler.ReachedTimeLimit` if appropriate.
|
Raises `brozzler.ReachedTimeLimit` if appropriate.
|
||||||
'''
|
'''
|
||||||
if (site.time_limit and site.time_limit > 0
|
if (site.time_limit
|
||||||
and (site.active_brozzling_time or 0) > site.time_limit):
|
and site.time_limit > 0
|
||||||
|
and (site.active_brozzling_time or 0) + session_time > site.time_limit):
|
||||||
self.logger.debug(
|
self.logger.debug(
|
||||||
"site FINISHED_TIME_LIMIT! time_limit=%s "
|
"site FINISHED_TIME_LIMIT! time_limit=%s "
|
||||||
"active_brozzling_time=%s %s", site.time_limit,
|
"active_brozzling_time=%s %s", site.time_limit,
|
||||||
|
|
|
@@ -507,12 +507,16 @@ class BrozzlerWorker:
|
||||||
site.save()
|
site.save()
|
||||||
start = time.time()
|
start = time.time()
|
||||||
page = None
|
page = None
|
||||||
|
self._frontier.enforce_time_limit(site)
|
||||||
|
self._frontier.honor_stop_request(site)
|
||||||
|
# _proxy_for() call in log statement can raise brozzler.ProxyError
|
||||||
|
# which is why we honor time limit and stop request first☝🏻
|
||||||
self.logger.info(
|
self.logger.info(
|
||||||
"brozzling site (proxy=%r) %r",
|
"brozzling site (proxy=%r) %r",
|
||||||
self._proxy_for(site), site)
|
self._proxy_for(site), site)
|
||||||
while time.time() - start < self.SITE_SESSION_MINUTES * 60:
|
while time.time() - start < self.SITE_SESSION_MINUTES * 60:
|
||||||
site.refresh()
|
site.refresh()
|
||||||
self._frontier.enforce_time_limit(site)
|
self._frontier.enforce_time_limit(site, time.time() - start)
|
||||||
self._frontier.honor_stop_request(site)
|
self._frontier.honor_stop_request(site)
|
||||||
page = self._frontier.claim_page(site, "%s:%s" % (
|
page = self._frontier.claim_page(site, "%s:%s" % (
|
||||||
socket.gethostname(), browser.chrome.port))
|
socket.gethostname(), browser.chrome.port))
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue