mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 16:49:56 -05:00
add methods to get all sites for a job, seed page for a site
This commit is contained in:
parent
2c7c713f00
commit
e210d417fb
@ -286,3 +286,19 @@ class RethinkDbFrontier:
|
|||||||
else:
|
else:
|
||||||
site.reached_limit = e.warcprox_meta["reached-limit"]
|
site.reached_limit = e.warcprox_meta["reached-limit"]
|
||||||
self.finished(site, "FINISHED_REACHED_LIMIT")
|
self.finished(site, "FINISHED_REACHED_LIMIT")
|
||||||
|
|
||||||
|
def job_sites(self, job_id):
    """Yield a ``brozzler.Site`` for each "sites" row belonging to a job.

    This is a generator: the query is only sent once iteration starts.

    Args:
        job_id: value looked up in the "job_id" index of the "sites" table.

    Yields:
        One ``brozzler.Site`` per matching row, built from the row's fields.
    """
    query = self.r.table('sites').get_all(job_id, index="job_id")
    for row in query.run():
        yield brozzler.Site(**row)
|
||||||
|
|
||||||
|
def seed_page(self, site_id):
    """Return the seed page of a site, or ``None`` if none is stored.

    Queries the "pages" table through the "priority_by_site" index, bounded
    so that only keys whose first element is ``site_id`` are considered,
    and keeps the rows whose ``hops_from_seed`` is 0.

    Args:
        site_id: id of the site whose seed page is wanted.

    Returns:
        A ``brozzler.Page`` built from the first matching row, or ``None``
        when no row matches. If several rows match, a warning is logged and
        the first one is still returned.
    """
    results = self.r.table("pages").between(
            [site_id, self.r.minval, self.r.minval, self.r.minval],
            [site_id, self.r.maxval, self.r.maxval, self.r.maxval],
            index="priority_by_site").filter({"hops_from_seed": 0}).run()
    pages = list(results)
    if not pages:
        # No seed page stored for this site; avoid IndexError on pages[0].
        return None
    if len(pages) > 1:
        self.logger.warning(
                "more than one seed page for site_id %s ?", site_id)
    return brozzler.Page(**pages[0])
|
||||||
|
Loading…
x
Reference in New Issue
Block a user