From 40522ef5a539b9d4c8b57490d1a08cd09e9e45f2 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 23 Sep 2015 01:53:05 +0000 Subject: [PATCH] fix some rethinkdb related stuff; most notably r.desc() and related stuff don't currently work correctly if r is a Rethinker, so use rethinkdb directly in that case --- brozzler/__init__.py | 5 ----- brozzler/frontier.py | 12 +++++++----- brozzler/site.py | 2 -- 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/brozzler/__init__.py b/brozzler/__init__.py index 7715ecf..f87a0f7 100644 --- a/brozzler/__init__.py +++ b/brozzler/__init__.py @@ -10,11 +10,6 @@ def _read_version(): version = _read_version() -# XXX don't know if these should be restricted; right now, only needed for -# rethinkdb "between" query -MAX_PRIORITY = 1000000000 -MIN_PRIORITY = -1000000000 - class ShutdownRequested(Exception): pass diff --git a/brozzler/frontier.py b/brozzler/frontier.py index 4d232bd..3c405f5 100644 --- a/brozzler/frontier.py +++ b/brozzler/frontier.py @@ -3,6 +3,7 @@ import brozzler import random import time import datetime +import rethinkdb class UnexpectedDbResult(Exception): pass @@ -25,12 +26,12 @@ class RethinkDbFrontier: if not "sites" in tables: self.logger.info("creating rethinkdb table 'sites' in database %s", repr(self.r.dbname)) self.r.table_create("sites", shards=self.shards, replicas=self.replicas).run() - self.r.table("sites").index_create("sites_last_disclaimed", [r.row["status"], r.row["claimed"], r.row["last_disclaimed"]]).run() + self.r.table("sites").index_create("sites_last_disclaimed", [self.r.row["status"], self.r.row["claimed"], self.r.row["last_disclaimed"]]).run() self.r.table("sites").index_create("job_id").run() if not "pages" in tables: self.logger.info("creating rethinkdb table 'pages' in database %s", repr(self.r.dbname)) self.r.table_create("pages", shards=self.shards, replicas=self.replicas).run() - self.r.table("pages").index_create("priority_by_site", [r.row["site_id"], r.row["brozzle_count"], r.row["claimed"], r.row["priority"]]).run() + self.r.table("pages").index_create("priority_by_site", [self.r.row["site_id"], self.r.row["brozzle_count"], self.r.row["claimed"], self.r.row["priority"]]).run() if not "jobs" in tables: self.logger.info("creating rethinkdb table 'jobs' in database %s", repr(self.r.dbname)) self.r.table_create("jobs", shards=self.shards, replicas=self.replicas).run() @@ -114,9 +115,10 @@ class RethinkDbFrontier: return False def claim_page(self, site, worker_id): + # import pdb; pdb.set_trace() result = (self.r.table("pages") - .between([site.id,0,False,brozzler.MIN_PRIORITY], [site.id,0,False,brozzler.MAX_PRIORITY], index="priority_by_site") - .order_by(index=r.desc("priority_by_site")).limit(1) + .between([site.id, 0, False, self.r.minval], [site.id, 0, False, self.r.maxval], index="priority_by_site") + .order_by(index=rethinkdb.desc("priority_by_site")).limit(1) .update({"claimed":True,"last_claimed_by":worker_id},return_changes=True)).run() self.logger.info("query returned %s", result) self._vet_result(result, replaced=[0,1]) @@ -126,7 +128,7 @@ class RethinkDbFrontier: raise brozzler.NothingToClaim def has_outstanding_pages(self, site): - results_iter = self.r.table("pages").between([site.id,0,False,brozzler.MIN_PRIORITY], [site.id,0,True,brozzler.MAX_PRIORITY], index="priority_by_site").limit(1).run() + results_iter = self.r.table("pages").between([site.id, 0, False, self.r.minval], [site.id, 0, True, self.r.maxval], index="priority_by_site").limit(1).run() return len(list(results_iter)) > 0 def page(self, id): diff --git a/brozzler/site.py b/brozzler/site.py index fc5186a..5aecc34 100644 --- a/brozzler/site.py +++ b/brozzler/site.py @@ -110,8 +110,6 @@ class Page(brozzler.BaseDictable): priority = 0 priority += max(0, 10 - self.hops_from_seed) priority += max(0, 6 - self.canon_url().count("/")) - priority = max(priority, brozzler.MIN_PRIORITY) - priority = min(priority, brozzler.MAX_PRIORITY) return priority def canon_url(self):