From e210d417fbc244cb0ba6aee91724efe5b4c0eb40 Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Mon, 25 Apr 2016 17:01:56 +0000
Subject: [PATCH] add methods to get all sites for a job, seed page for a site

---
 brozzler/frontier.py | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/brozzler/frontier.py b/brozzler/frontier.py
index 037afbe..3cf9c5f 100644
--- a/brozzler/frontier.py
+++ b/brozzler/frontier.py
@@ -286,3 +286,37 @@ class RethinkDbFrontier:
         else:
             site.reached_limit = e.warcprox_meta["reached-limit"]
             self.finished(site, "FINISHED_REACHED_LIMIT")
+
+    def job_sites(self, job_id):
+        """
+        Yield a brozzler.Site for every row of the "sites" table whose
+        "job_id" index matches job_id.
+
+        This is a generator, so the query does not run until the caller
+        starts iterating.
+        """
+        results = self.r.table('sites').get_all(job_id, index="job_id").run()
+        for result in results:
+            yield brozzler.Site(**result)
+
+    def seed_page(self, site_id):
+        """
+        Return a brozzler.Page for the page of site_id that has
+        hops_from_seed == 0, or None if the query finds no such page.
+
+        If the query finds more than one such page, log a warning and
+        return the first one.
+        """
+        results = self.r.table("pages").between(
+                [site_id, self.r.minval, self.r.minval, self.r.minval],
+                [site_id, self.r.maxval, self.r.maxval, self.r.maxval],
+                index="priority_by_site").filter({"hops_from_seed":0}).run()
+        pages = list(results)
+        if not pages:
+            # nothing matched; return None rather than letting pages[0]
+            # raise IndexError
+            return None
+        if len(pages) > 1:
+            self.logger.warning(
+                    "more than one seed page for site_id %s ?", site_id)
+        return brozzler.Page(**pages[0])