From b83d3cb9df08099952c83ccd66cd85c7670c64c4 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 22 Mar 2018 17:07:52 -0700 Subject: [PATCH] rename page.hops_off_surt to page.hops_off --- brozzler/frontier.py | 8 ++++---- brozzler/model.py | 10 +++++++--- tests/test_frontier.py | 6 +++--- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/brozzler/frontier.py b/brozzler/frontier.py index 5276f72..b9785bc 100644 --- a/brozzler/frontier.py +++ b/brozzler/frontier.py @@ -315,8 +315,8 @@ class RethinkDbFrontier: existing_page.priority += fresh_page.priority existing_page.hashtags = list(set( existing_page.hashtags + fresh_page.hashtags)) - existing_page.hops_off_surt = min( - existing_page.hops_off_surt, fresh_page.hops_off_surt) + existing_page.hops_off = min( + existing_page.hops_off, fresh_page.hops_off) def _scope_and_enforce_robots(self, site, parent_page, outlinks): ''' @@ -337,9 +337,9 @@ class RethinkDbFrontier: if decision is True: hops_off = 0 elif decision is None: - decision = parent_page.hops_off_surt < site.scope.get( + decision = parent_page.hops_off < site.scope.get( 'max_hops_off_surt', 0) - hops_off = parent_page.hops_off_surt + 1 + hops_off = parent_page.hops_off + 1 if decision is True: if brozzler.is_permitted_by_robots(site, str(url_for_crawling)): fresh_page = self._build_fresh_page( diff --git a/brozzler/model.py b/brozzler/model.py index 6b35bc2..9c6f482 100644 --- a/brozzler/model.py +++ b/brozzler/model.py @@ -99,7 +99,7 @@ def new_job(frontier, job_conf): def new_site(frontier, site): site.id = str(uuid.uuid4()) - logging.info("new site {}".format(site)) + logging.info("new site %s", site) # insert the Page into the database before the Site, to avoid situation # where a brozzler worker immediately claims the site, finds no pages # to crawl, and decides the site is finished @@ -291,8 +291,12 @@ class Page(doublethink.Document): self.brozzle_count = 0 if not "claimed" in self: self.claimed = False - if not "hops_off_surt" in self: - self.hops_off_surt = 0 + if "hops_off_surt" in self and not "hops_off" in self: + self.hops_off = self.hops_off_surt + if "hops_off_surt" in self: + del self["hops_off_surt"] + if not "hops_off" in self: + self.hops_off = 0 if not "needs_robots_check" in self: self.needs_robots_check = False if not "priority" in self: diff --git a/tests/test_frontier.py b/tests/test_frontier.py index 669f0ec..97c4e83 100644 --- a/tests/test_frontier.py +++ b/tests/test_frontier.py @@ -106,7 +106,7 @@ def test_basics(): 'brozzle_count': 0, 'claimed': False, 'hops_from_seed': 0, - 'hops_off_surt': 0, + 'hops_off': 0, 'id': brozzler.Page.compute_id(sites[0].id, 'http://example.com'), 'job_id': job.id, 'needs_robots_check': True, @@ -120,7 +120,7 @@ def test_basics(): 'brozzle_count': 0, 'claimed': False, 'hops_from_seed': 0, - 'hops_off_surt': 0, + 'hops_off': 0, 'id': brozzler.Page.compute_id(sites[1].id, 'https://example.org/'), 'job_id': job.id, 'needs_robots_check': True, @@ -907,7 +907,7 @@ def test_choose_warcprox(): svcreg = doublethink.ServiceRegistry(rr) frontier = brozzler.RethinkDbFrontier(rr) - # avoid this of error: https://travis-ci.org/internetarchive/brozzler/jobs/330991786#L1021 + # avoid this error: https://travis-ci.org/internetarchive/brozzler/jobs/330991786#L1021 rr.table('sites').wait().run() rr.table('services').wait().run() rr.table('sites').index_wait().run()