mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 16:49:56 -05:00
add methods to get all sites for a job, seed page for a site
This commit is contained in:
parent
2c7c713f00
commit
e210d417fb
@ -286,3 +286,19 @@ class RethinkDbFrontier:
|
||||
else:
|
||||
site.reached_limit = e.warcprox_meta["reached-limit"]
|
||||
self.finished(site, "FINISHED_REACHED_LIMIT")
|
||||
|
||||
def job_sites(self, job_id):
    """Yield a ``brozzler.Site`` for every site row belonging to a job.

    Rows are looked up in the ``sites`` table through its ``job_id``
    secondary index. This is a generator: the query is not run until the
    first item is requested.

    Args:
        job_id: value matched against the ``job_id`` index.

    Yields:
        brozzler.Site: one instance per matching row, built by passing the
        row's fields as keyword arguments.
    """
    query = self.r.table('sites').get_all(job_id, index="job_id")
    for row in query.run():
        yield brozzler.Site(**row)
|
||||
|
||||
def seed_page(self, site_id):
    """Return the seed page of a site, or ``None`` if it has none.

    Queries the ``pages`` table over the ``priority_by_site`` index,
    restricted to keys whose first component is ``site_id``, and keeps
    only rows with ``hops_from_seed`` equal to 0.

    Args:
        site_id: id of the site whose seed page is wanted.

    Returns:
        brozzler.Page built from the first matching row, or ``None`` when
        no row matches. A warning is logged if more than one row matches.
    """
    # Bounds span every index key that starts with site_id.
    lower_key = [site_id, self.r.minval, self.r.minval, self.r.minval]
    upper_key = [site_id, self.r.maxval, self.r.maxval, self.r.maxval]
    results = self.r.table("pages").between(
        lower_key, upper_key,
        index="priority_by_site").filter({"hops_from_seed":0}).run()
    pages = list(results)
    if not pages:
        # No seed page stored for this site; avoid IndexError on pages[0].
        return None
    if len(pages) > 1:
        self.logger.warning(
            "more than one seed page for site_id %s ?", site_id)
    return brozzler.Page(**pages[0])
|
||||
|
Loading…
x
Reference in New Issue
Block a user