mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-05-02 14:46:18 -04:00
To avoid infinite loops in some cases, ignore the "claimed" field in the RethinkDB table "pages": if a page is left "claimed", it must have been because of some error. site.claimed is the real claiming mechanism.
This commit is contained in:
parent
7a805a43d1
commit
d04c5a31cc
2 changed files with 18 additions and 6 deletions
|
@ -185,10 +185,19 @@ class RethinkDbFrontier:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def claim_page(self, site, worker_id):
|
def claim_page(self, site, worker_id):
|
||||||
result = (self.r.table("pages")
|
# ignores the "claimed" field of the page, because only one
|
||||||
.between([site.id, 0, False, self.r.minval], [site.id, 0, False, self.r.maxval], index="priority_by_site")
|
# brozzler-worker can be working on a site at a time, and that would
|
||||||
.order_by(index=rethinkdb.desc("priority_by_site")).limit(1)
|
# have to be the worker calling this method, so if something is claimed
|
||||||
.update({"claimed":True,"last_claimed_by":worker_id},return_changes=True)).run()
|
# already, it must have been left that way because of some error
|
||||||
|
result = self.r.table("pages").between(
|
||||||
|
[site.id, 0, self.r.minval, self.r.minval],
|
||||||
|
[site.id, 0, self.r.maxval, self.r.maxval],
|
||||||
|
index="priority_by_site").order_by(
|
||||||
|
index=rethinkdb.desc("priority_by_site")).limit(
|
||||||
|
1).update({
|
||||||
|
"claimed":True,
|
||||||
|
"last_claimed_by":worker_id},
|
||||||
|
return_changes=True).run()
|
||||||
self._vet_result(result, replaced=[0,1])
|
self._vet_result(result, replaced=[0,1])
|
||||||
if result["replaced"] == 1:
|
if result["replaced"] == 1:
|
||||||
return brozzler.Page(**result["changes"][0]["new_val"])
|
return brozzler.Page(**result["changes"][0]["new_val"])
|
||||||
|
@ -196,7 +205,10 @@ class RethinkDbFrontier:
|
||||||
raise brozzler.NothingToClaim
|
raise brozzler.NothingToClaim
|
||||||
|
|
||||||
def has_outstanding_pages(self, site):
|
def has_outstanding_pages(self, site):
|
||||||
results_iter = self.r.table("pages").between([site.id, 0, False, self.r.minval], [site.id, 0, True, self.r.maxval], index="priority_by_site").limit(1).run()
|
results_iter = self.r.table("pages").between(
|
||||||
|
[site.id, 0, self.r.minval, self.r.minval],
|
||||||
|
[site.id, 0, self.r.maxval, self.r.maxval],
|
||||||
|
index="priority_by_site").limit(1).run()
|
||||||
return len(list(results_iter)) > 0
|
return len(list(results_iter)) > 0
|
||||||
|
|
||||||
def page(self, id):
|
def page(self, id):
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -21,7 +21,7 @@ import setuptools
|
||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.1.dev30',
|
version='1.1.dev31',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue