rename page.hops_off_surt to page.hops_off

This commit is contained in:
Noah Levitt 2018-03-22 17:07:52 -07:00
parent 60f2b99cc0
commit b83d3cb9df
3 changed files with 14 additions and 10 deletions

View File

@@ -315,8 +315,8 @@ class RethinkDbFrontier:
existing_page.priority += fresh_page.priority
existing_page.hashtags = list(set(
existing_page.hashtags + fresh_page.hashtags))
existing_page.hops_off_surt = min(
existing_page.hops_off_surt, fresh_page.hops_off_surt)
existing_page.hops_off = min(
existing_page.hops_off, fresh_page.hops_off)
def _scope_and_enforce_robots(self, site, parent_page, outlinks):
'''
@@ -337,9 +337,9 @@ class RethinkDbFrontier:
if decision is True:
hops_off = 0
elif decision is None:
decision = parent_page.hops_off_surt < site.scope.get(
decision = parent_page.hops_off < site.scope.get(
'max_hops_off_surt', 0)
hops_off = parent_page.hops_off_surt + 1
hops_off = parent_page.hops_off + 1
if decision is True:
if brozzler.is_permitted_by_robots(site, str(url_for_crawling)):
fresh_page = self._build_fresh_page(

View File

@@ -99,7 +99,7 @@ def new_job(frontier, job_conf):
def new_site(frontier, site):
site.id = str(uuid.uuid4())
logging.info("new site {}".format(site))
logging.info("new site %s", site)
# insert the Page into the database before the Site, to avoid situation
# where a brozzler worker immediately claims the site, finds no pages
# to crawl, and decides the site is finished
@@ -291,8 +291,12 @@ class Page(doublethink.Document):
self.brozzle_count = 0
if not "claimed" in self:
self.claimed = False
if not "hops_off_surt" in self:
self.hops_off_surt = 0
if "hops_off_surt" in self and not "hops_off" in self:
self.hops_off = self.hops_off_surt
if "hops_off_surt" in self:
del self["hops_off_surt"]
if not "hops_off" in self:
self.hops_off = 0
if not "needs_robots_check" in self:
self.needs_robots_check = False
if not "priority" in self:

View File

@@ -106,7 +106,7 @@ def test_basics():
'brozzle_count': 0,
'claimed': False,
'hops_from_seed': 0,
'hops_off_surt': 0,
'hops_off': 0,
'id': brozzler.Page.compute_id(sites[0].id, 'http://example.com'),
'job_id': job.id,
'needs_robots_check': True,
@@ -120,7 +120,7 @@ def test_basics():
'brozzle_count': 0,
'claimed': False,
'hops_from_seed': 0,
'hops_off_surt': 0,
'hops_off': 0,
'id': brozzler.Page.compute_id(sites[1].id, 'https://example.org/'),
'job_id': job.id,
'needs_robots_check': True,
@@ -907,7 +907,7 @@ def test_choose_warcprox():
svcreg = doublethink.ServiceRegistry(rr)
frontier = brozzler.RethinkDbFrontier(rr)
# avoid this of error: https://travis-ci.org/internetarchive/brozzler/jobs/330991786#L1021
# avoid this error: https://travis-ci.org/internetarchive/brozzler/jobs/330991786#L1021
rr.table('sites').wait().run()
rr.table('services').wait().run()
rr.table('sites').index_wait().run()