mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-23 16:19:49 -05:00
rename page.hops_off_surt to page.hops_off
This commit is contained in:
parent
60f2b99cc0
commit
b83d3cb9df
@ -315,8 +315,8 @@ class RethinkDbFrontier:
|
||||
existing_page.priority += fresh_page.priority
|
||||
existing_page.hashtags = list(set(
|
||||
existing_page.hashtags + fresh_page.hashtags))
|
||||
existing_page.hops_off_surt = min(
|
||||
existing_page.hops_off_surt, fresh_page.hops_off_surt)
|
||||
existing_page.hops_off = min(
|
||||
existing_page.hops_off, fresh_page.hops_off)
|
||||
|
||||
def _scope_and_enforce_robots(self, site, parent_page, outlinks):
|
||||
'''
|
||||
@ -337,9 +337,9 @@ class RethinkDbFrontier:
|
||||
if decision is True:
|
||||
hops_off = 0
|
||||
elif decision is None:
|
||||
decision = parent_page.hops_off_surt < site.scope.get(
|
||||
decision = parent_page.hops_off < site.scope.get(
|
||||
'max_hops_off_surt', 0)
|
||||
hops_off = parent_page.hops_off_surt + 1
|
||||
hops_off = parent_page.hops_off + 1
|
||||
if decision is True:
|
||||
if brozzler.is_permitted_by_robots(site, str(url_for_crawling)):
|
||||
fresh_page = self._build_fresh_page(
|
||||
|
@ -99,7 +99,7 @@ def new_job(frontier, job_conf):
|
||||
|
||||
def new_site(frontier, site):
|
||||
site.id = str(uuid.uuid4())
|
||||
logging.info("new site {}".format(site))
|
||||
logging.info("new site %s", site)
|
||||
# insert the Page into the database before the Site, to avoid situation
|
||||
# where a brozzler worker immediately claims the site, finds no pages
|
||||
# to crawl, and decides the site is finished
|
||||
@ -291,8 +291,12 @@ class Page(doublethink.Document):
|
||||
self.brozzle_count = 0
|
||||
if not "claimed" in self:
|
||||
self.claimed = False
|
||||
if not "hops_off_surt" in self:
|
||||
self.hops_off_surt = 0
|
||||
if "hops_off_surt" in self and not "hops_off" in self:
|
||||
self.hops_off = self.hops_off_surt
|
||||
if "hops_off_surt" in self:
|
||||
del self["hops_off_surt"]
|
||||
if not "hops_off" in self:
|
||||
self.hops_off = 0
|
||||
if not "needs_robots_check" in self:
|
||||
self.needs_robots_check = False
|
||||
if not "priority" in self:
|
||||
|
@ -106,7 +106,7 @@ def test_basics():
|
||||
'brozzle_count': 0,
|
||||
'claimed': False,
|
||||
'hops_from_seed': 0,
|
||||
'hops_off_surt': 0,
|
||||
'hops_off': 0,
|
||||
'id': brozzler.Page.compute_id(sites[0].id, 'http://example.com'),
|
||||
'job_id': job.id,
|
||||
'needs_robots_check': True,
|
||||
@ -120,7 +120,7 @@ def test_basics():
|
||||
'brozzle_count': 0,
|
||||
'claimed': False,
|
||||
'hops_from_seed': 0,
|
||||
'hops_off_surt': 0,
|
||||
'hops_off': 0,
|
||||
'id': brozzler.Page.compute_id(sites[1].id, 'https://example.org/'),
|
||||
'job_id': job.id,
|
||||
'needs_robots_check': True,
|
||||
@ -907,7 +907,7 @@ def test_choose_warcprox():
|
||||
svcreg = doublethink.ServiceRegistry(rr)
|
||||
frontier = brozzler.RethinkDbFrontier(rr)
|
||||
|
||||
# avoid this of error: https://travis-ci.org/internetarchive/brozzler/jobs/330991786#L1021
|
||||
# avoid this error: https://travis-ci.org/internetarchive/brozzler/jobs/330991786#L1021
|
||||
rr.table('sites').wait().run()
|
||||
rr.table('services').wait().run()
|
||||
rr.table('sites').index_wait().run()
|
||||
|
Loading…
x
Reference in New Issue
Block a user