Mirror of https://github.com/internetarchive/brozzler.git, synced 2025-05-02 14:46:18 -04:00.
To avoid infinite loops in some cases, ignore the "claimed" field in the rethinkdb "pages" table: if a page is left "claimed", that must be the result of an earlier error, since site.claimed is the real claiming mechanism.
This commit is contained in:
parent
7a805a43d1
commit
d04c5a31cc
2 changed files with 18 additions and 6 deletions
|
@ -185,10 +185,19 @@ class RethinkDbFrontier:
|
|||
return False
|
||||
|
||||
def claim_page(self, site, worker_id):
    """Claim the highest-priority page of `site` for `worker_id`.

    Atomically marks the page as claimed in the rethinkdb "pages" table
    and records which worker claimed it, then returns it as a
    brozzler.Page. Raises brozzler.NothingToClaim if no page was updated.

    Note: the query deliberately ignores the page's existing "claimed"
    field (the index bounds range over minval..maxval in that position
    rather than filtering on False). Only one brozzler-worker can work on
    a site at a time — and that worker is the one calling this method —
    so a page left "claimed" must be stale residue from an earlier error,
    and skipping it could loop forever. site.claimed is the real
    claiming mechanism.
    """
    result = self.r.table("pages").between(
            [site.id, 0, self.r.minval, self.r.minval],
            [site.id, 0, self.r.maxval, self.r.maxval],
            index="priority_by_site").order_by(
                    index=rethinkdb.desc("priority_by_site")).limit(
                            1).update({
                                "claimed":True,
                                "last_claimed_by":worker_id},
                                return_changes=True).run()
    # replaced may legitimately be 0 (nothing to claim) or 1 (one page
    # claimed); anything else indicates a problem with the query result
    self._vet_result(result, replaced=[0,1])
    if result["replaced"] == 1:
        return brozzler.Page(**result["changes"][0]["new_val"])
    else:
        raise brozzler.NothingToClaim
|
||||
|
||||
def has_outstanding_pages(self, site):
    """Return True if `site` has at least one pending page.

    Queries the "pages" table over the full priority_by_site index range
    for hop 0 of this site. Like claim_page, it ranges minval..maxval in
    the "claimed" index position, so pages stuck in a claimed state from
    an earlier error still count as outstanding.
    """
    results_iter = self.r.table("pages").between(
            [site.id, 0, self.r.minval, self.r.minval],
            [site.id, 0, self.r.maxval, self.r.maxval],
            index="priority_by_site").limit(1).run()
    # limit(1) means we only need to know whether the cursor is nonempty
    return len(list(results_iter)) > 0
|
||||
|
||||
def page(self, id):
|
||||
|
|
2
setup.py
2
setup.py
|
@ -21,7 +21,7 @@ import setuptools
|
|||
|
||||
setuptools.setup(
|
||||
name='brozzler',
|
||||
version='1.1.dev30',
|
||||
version='1.1.dev31',
|
||||
description='Distributed web crawling with browsers',
|
||||
url='https://github.com/internetarchive/brozzler',
|
||||
author='Noah Levitt',
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue