mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-06-19 20:34:09 -04:00
rename page.hops_off_surt to page.hops_off
This commit is contained in:
parent
60f2b99cc0
commit
b83d3cb9df
3 changed files with 14 additions and 10 deletions
|
@ -315,8 +315,8 @@ class RethinkDbFrontier:
|
||||||
existing_page.priority += fresh_page.priority
|
existing_page.priority += fresh_page.priority
|
||||||
existing_page.hashtags = list(set(
|
existing_page.hashtags = list(set(
|
||||||
existing_page.hashtags + fresh_page.hashtags))
|
existing_page.hashtags + fresh_page.hashtags))
|
||||||
existing_page.hops_off_surt = min(
|
existing_page.hops_off = min(
|
||||||
existing_page.hops_off_surt, fresh_page.hops_off_surt)
|
existing_page.hops_off, fresh_page.hops_off)
|
||||||
|
|
||||||
def _scope_and_enforce_robots(self, site, parent_page, outlinks):
|
def _scope_and_enforce_robots(self, site, parent_page, outlinks):
|
||||||
'''
|
'''
|
||||||
|
@ -337,9 +337,9 @@ class RethinkDbFrontier:
|
||||||
if decision is True:
|
if decision is True:
|
||||||
hops_off = 0
|
hops_off = 0
|
||||||
elif decision is None:
|
elif decision is None:
|
||||||
decision = parent_page.hops_off_surt < site.scope.get(
|
decision = parent_page.hops_off < site.scope.get(
|
||||||
'max_hops_off_surt', 0)
|
'max_hops_off_surt', 0)
|
||||||
hops_off = parent_page.hops_off_surt + 1
|
hops_off = parent_page.hops_off + 1
|
||||||
if decision is True:
|
if decision is True:
|
||||||
if brozzler.is_permitted_by_robots(site, str(url_for_crawling)):
|
if brozzler.is_permitted_by_robots(site, str(url_for_crawling)):
|
||||||
fresh_page = self._build_fresh_page(
|
fresh_page = self._build_fresh_page(
|
||||||
|
|
|
@ -99,7 +99,7 @@ def new_job(frontier, job_conf):
|
||||||
|
|
||||||
def new_site(frontier, site):
|
def new_site(frontier, site):
|
||||||
site.id = str(uuid.uuid4())
|
site.id = str(uuid.uuid4())
|
||||||
logging.info("new site {}".format(site))
|
logging.info("new site %s", site)
|
||||||
# insert the Page into the database before the Site, to avoid situation
|
# insert the Page into the database before the Site, to avoid situation
|
||||||
# where a brozzler worker immediately claims the site, finds no pages
|
# where a brozzler worker immediately claims the site, finds no pages
|
||||||
# to crawl, and decides the site is finished
|
# to crawl, and decides the site is finished
|
||||||
|
@ -291,8 +291,12 @@ class Page(doublethink.Document):
|
||||||
self.brozzle_count = 0
|
self.brozzle_count = 0
|
||||||
if not "claimed" in self:
|
if not "claimed" in self:
|
||||||
self.claimed = False
|
self.claimed = False
|
||||||
if not "hops_off_surt" in self:
|
if "hops_off_surt" in self and not "hops_off" in self:
|
||||||
self.hops_off_surt = 0
|
self.hops_off = self.hops_off_surt
|
||||||
|
if "hops_off_surt" in self:
|
||||||
|
del self["hops_off_surt"]
|
||||||
|
if not "hops_off" in self:
|
||||||
|
self.hops_off = 0
|
||||||
if not "needs_robots_check" in self:
|
if not "needs_robots_check" in self:
|
||||||
self.needs_robots_check = False
|
self.needs_robots_check = False
|
||||||
if not "priority" in self:
|
if not "priority" in self:
|
||||||
|
|
|
@ -106,7 +106,7 @@ def test_basics():
|
||||||
'brozzle_count': 0,
|
'brozzle_count': 0,
|
||||||
'claimed': False,
|
'claimed': False,
|
||||||
'hops_from_seed': 0,
|
'hops_from_seed': 0,
|
||||||
'hops_off_surt': 0,
|
'hops_off': 0,
|
||||||
'id': brozzler.Page.compute_id(sites[0].id, 'http://example.com'),
|
'id': brozzler.Page.compute_id(sites[0].id, 'http://example.com'),
|
||||||
'job_id': job.id,
|
'job_id': job.id,
|
||||||
'needs_robots_check': True,
|
'needs_robots_check': True,
|
||||||
|
@ -120,7 +120,7 @@ def test_basics():
|
||||||
'brozzle_count': 0,
|
'brozzle_count': 0,
|
||||||
'claimed': False,
|
'claimed': False,
|
||||||
'hops_from_seed': 0,
|
'hops_from_seed': 0,
|
||||||
'hops_off_surt': 0,
|
'hops_off': 0,
|
||||||
'id': brozzler.Page.compute_id(sites[1].id, 'https://example.org/'),
|
'id': brozzler.Page.compute_id(sites[1].id, 'https://example.org/'),
|
||||||
'job_id': job.id,
|
'job_id': job.id,
|
||||||
'needs_robots_check': True,
|
'needs_robots_check': True,
|
||||||
|
@ -907,7 +907,7 @@ def test_choose_warcprox():
|
||||||
svcreg = doublethink.ServiceRegistry(rr)
|
svcreg = doublethink.ServiceRegistry(rr)
|
||||||
frontier = brozzler.RethinkDbFrontier(rr)
|
frontier = brozzler.RethinkDbFrontier(rr)
|
||||||
|
|
||||||
# avoid this of error: https://travis-ci.org/internetarchive/brozzler/jobs/330991786#L1021
|
# avoid this error: https://travis-ci.org/internetarchive/brozzler/jobs/330991786#L1021
|
||||||
rr.table('sites').wait().run()
|
rr.table('sites').wait().run()
|
||||||
rr.table('services').wait().run()
|
rr.table('services').wait().run()
|
||||||
rr.table('sites').index_wait().run()
|
rr.table('sites').index_wait().run()
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue