fix some rethinkdb related stuff; most notably r.desc() and related stuff don't currently work correctly if r is a Rethinker, so use rethinkdb directly in that case

This commit is contained in:
Noah Levitt 2015-09-23 01:53:05 +00:00
parent 8bf34d9db6
commit 40522ef5a5
3 changed files with 7 additions and 12 deletions

View File

@ -10,11 +10,6 @@ def _read_version():
version = _read_version()
# XXX don't know if these should be restricted; right now, only needed for
# rethinkdb "between" query
MAX_PRIORITY = 1000000000
MIN_PRIORITY = -1000000000
class ShutdownRequested(Exception):
pass

View File

@ -3,6 +3,7 @@ import brozzler
import random
import time
import datetime
import rethinkdb
class UnexpectedDbResult(Exception):
pass
@ -25,12 +26,12 @@ class RethinkDbFrontier:
if not "sites" in tables:
self.logger.info("creating rethinkdb table 'sites' in database %s", repr(self.r.dbname))
self.r.table_create("sites", shards=self.shards, replicas=self.replicas).run()
self.r.table("sites").index_create("sites_last_disclaimed", [r.row["status"], r.row["claimed"], r.row["last_disclaimed"]]).run()
self.r.table("sites").index_create("sites_last_disclaimed", [self.r.row["status"], self.r.row["claimed"], self.r.row["last_disclaimed"]]).run()
self.r.table("sites").index_create("job_id").run()
if not "pages" in tables:
self.logger.info("creating rethinkdb table 'pages' in database %s", repr(self.r.dbname))
self.r.table_create("pages", shards=self.shards, replicas=self.replicas).run()
self.r.table("pages").index_create("priority_by_site", [r.row["site_id"], r.row["brozzle_count"], r.row["claimed"], r.row["priority"]]).run()
self.r.table("pages").index_create("priority_by_site", [self.r.row["site_id"], self.r.row["brozzle_count"], self.r.row["claimed"], self.r.row["priority"]]).run()
if not "jobs" in tables:
self.logger.info("creating rethinkdb table 'jobs' in database %s", repr(self.r.dbname))
self.r.table_create("jobs", shards=self.shards, replicas=self.replicas).run()
@ -114,9 +115,10 @@ class RethinkDbFrontier:
return False
def claim_page(self, site, worker_id):
# import pdb; pdb.set_trace()
result = (self.r.table("pages")
.between([site.id,0,False,brozzler.MIN_PRIORITY], [site.id,0,False,brozzler.MAX_PRIORITY], index="priority_by_site")
.order_by(index=r.desc("priority_by_site")).limit(1)
.between([site.id, 0, False, self.r.minval], [site.id, 0, False, self.r.maxval], index="priority_by_site")
.order_by(index=rethinkdb.desc("priority_by_site")).limit(1)
.update({"claimed":True,"last_claimed_by":worker_id},return_changes=True)).run()
self.logger.info("query returned %s", result)
self._vet_result(result, replaced=[0,1])
@ -126,7 +128,7 @@ class RethinkDbFrontier:
raise brozzler.NothingToClaim
def has_outstanding_pages(self, site):
results_iter = self.r.table("pages").between([site.id,0,False,brozzler.MIN_PRIORITY], [site.id,0,True,brozzler.MAX_PRIORITY], index="priority_by_site").limit(1).run()
results_iter = self.r.table("pages").between([site.id, 0, False, self.r.minval], [site.id, 0, True, self.r.maxval], index="priority_by_site").limit(1).run()
return len(list(results_iter)) > 0
def page(self, id):

View File

@ -110,8 +110,6 @@ class Page(brozzler.BaseDictable):
priority = 0
priority += max(0, 10 - self.hops_from_seed)
priority += max(0, 6 - self.canon_url().count("/"))
priority = max(priority, brozzler.MIN_PRIORITY)
priority = min(priority, brozzler.MAX_PRIORITY)
return priority
def canon_url(self):