to avoid infinite loops in some cases, ignore the "claimed" field in the rethinkdb table "pages", because if a page is left "claimed", it must have been because of some error... site.claimed is the real claiming mechanism

Noah Levitt 2016-06-29 00:02:25 +00:00
parent 7a805a43d1
commit d04c5a31cc
2 changed files with 18 additions and 6 deletions

brozzler/frontier.py

@@ -185,10 +185,19 @@ class RethinkDbFrontier:
         return False
 
     def claim_page(self, site, worker_id):
-        result = (self.r.table("pages")
-                .between([site.id, 0, False, self.r.minval], [site.id, 0, False, self.r.maxval], index="priority_by_site")
-                .order_by(index=rethinkdb.desc("priority_by_site")).limit(1)
-                .update({"claimed":True,"last_claimed_by":worker_id},return_changes=True)).run()
+        # ignores the "claimed" field of the page, because only one
+        # brozzler-worker can be working on a site at a time, and that would
+        # have to be the worker calling this method, so if something is claimed
+        # already, it must have been left that way because of some error
+        result = self.r.table("pages").between(
+                [site.id, 0, self.r.minval, self.r.minval],
+                [site.id, 0, self.r.maxval, self.r.maxval],
+                index="priority_by_site").order_by(
+                        index=rethinkdb.desc("priority_by_site")).limit(
+                                1).update({
+                                    "claimed":True,
+                                    "last_claimed_by":worker_id},
+                                    return_changes=True).run()
         self._vet_result(result, replaced=[0,1])
         if result["replaced"] == 1:
             return brozzler.Page(**result["changes"][0]["new_val"])
@@ -196,7 +205,10 @@ class RethinkDbFrontier:
             raise brozzler.NothingToClaim
 
     def has_outstanding_pages(self, site):
-        results_iter = self.r.table("pages").between([site.id, 0, False, self.r.minval], [site.id, 0, True, self.r.maxval], index="priority_by_site").limit(1).run()
+        results_iter = self.r.table("pages").between(
+                [site.id, 0, self.r.minval, self.r.minval],
+                [site.id, 0, self.r.maxval, self.r.maxval],
+                index="priority_by_site").limit(1).run()
         return len(list(results_iter)) > 0
 
     def page(self, id):
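
The functional change in this file is in the between() bounds on the compound "priority_by_site" index: the third component of the key, previously pinned to False (unclaimed pages only), now spans minval to maxval, so a page left marked claimed by an earlier error is still eligible to be claimed again. Below is a minimal sketch of the old versus new key ranges, assuming the index is a compound index over something like [site_id, brozzled, claimed, priority]; the exact index definition is not part of this diff.

    # illustrative only -- assumes index layout [site_id, brozzled, claimed, priority]
    import rethinkdb as r

    site_id = "some-site-id"  # placeholder

    # old bounds: only pages with claimed == False could match
    old_bounds = ([site_id, 0, False, r.minval], [site_id, 0, False, r.maxval])

    # new bounds: claimed spans its whole range, so pages stuck with
    # claimed == True (e.g. after a worker error) are matched too
    new_bounds = ([site_id, 0, r.minval, r.minval], [site_id, 0, r.maxval, r.maxval])

With this change the page-level "claimed" flag is informational only; as the commit message notes, mutual exclusion rests on the site-level claim (site.claimed).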

setup.py

@@ -21,7 +21,7 @@ import setuptools
 setuptools.setup(
         name='brozzler',
-        version='1.1.dev30',
+        version='1.1.dev31',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',