Merge branch 'master' of github.com:nlevitt/brozzler

* 'master' of github.com:nlevitt/brozzler:
  fix bug preventing brozzler from simultaneously working on more than one site from the same job
This commit is contained in:
Noah Levitt 2016-04-04 22:43:18 -07:00
commit ed0ea24de6
2 changed files with 5 additions and 1 deletion

View File

@@ -96,11 +96,12 @@ class RethinkDbFrontier:
["ACTIVE",rethinkdb.minval], ["ACTIVE",rethinkdb.minval],
["ACTIVE",rethinkdb.maxval], ["ACTIVE",rethinkdb.maxval],
index="sites_last_disclaimed") index="sites_last_disclaimed")
.order_by(index="sites_last_disclaimed").limit(1) .order_by(index="sites_last_disclaimed")
.filter( .filter(
(rethinkdb.row["claimed"] != True) | (rethinkdb.row["claimed"] != True) |
(rethinkdb.row["last_disclaimed"] (rethinkdb.row["last_disclaimed"]
< rethinkdb.now() - 2*60*60)) < rethinkdb.now() - 2*60*60))
.limit(1)
.update({"claimed":True,"last_claimed_by":worker_id}, .update({"claimed":True,"last_claimed_by":worker_id},
return_changes=True)).run() return_changes=True)).run()
self._vet_result(result, replaced=[0,1], unchanged=[0,1]) self._vet_result(result, replaced=[0,1], unchanged=[0,1])

View File

@@ -43,6 +43,9 @@ class Site(brozzler.BaseDictable):
repr(self.proxy), self.enable_warcprox_features, repr(self.proxy), self.enable_warcprox_features,
self.ignore_robots, self.extra_headers, self.reached_limit) self.ignore_robots, self.extra_headers, self.reached_limit)
def __str__(self):
return "site-%s-%s" % (self.id, self.seed)
def _to_surt(self, url): def _to_surt(self, url):
hurl = surt.handyurl.parse(url) hurl = surt.handyurl.parse(url)
surt.GoogleURLCanonicalizer.canonicalize(hurl) surt.GoogleURLCanonicalizer.canonicalize(hurl)