diff --git a/brozzler/frontier.py b/brozzler/frontier.py index 628ff32..772fca3 100644 --- a/brozzler/frontier.py +++ b/brozzler/frontier.py @@ -261,8 +261,7 @@ class RethinkDbFrontier: site.save() def scope_and_schedule_outlinks(self, site, parent_page, outlinks): - if site.remember_outlinks: - decisions = {"accepted":set(),"blocked":set(),"rejected":set()} + decisions = {"accepted":set(),"blocked":set(),"rejected":set()} counts = {"added":0,"updated":0,"rejected":0,"blocked":0} for url in outlinks or []: url_for_scoping = urlcanon.semantic(url) @@ -289,22 +288,18 @@ class RethinkDbFrontier: else: new_child_page.save() counts["added"] += 1 - if site.remember_outlinks: - decisions["accepted"].add(str(url_for_crawling)) + decisions["accepted"].add(str(url_for_crawling)) else: counts["blocked"] += 1 - if site.remember_outlinks: - decisions["blocked"].add(str(url_for_crawling)) + decisions["blocked"].add(str(url_for_crawling)) else: counts["rejected"] += 1 - if site.remember_outlinks: - decisions["rejected"].add(str(url_for_crawling)) + decisions["rejected"].add(str(url_for_crawling)) - if site.remember_outlinks: - parent_page.outlinks = {} - for k in decisions: - parent_page.outlinks[k] = list(decisions[k]) - parent_page.save() + parent_page.outlinks = {} + for k in decisions: + parent_page.outlinks[k] = list(decisions[k]) + parent_page.save() self.logger.info( "%s new links added, %s existing links updated, %s links " diff --git a/brozzler/job_schema.yaml b/brozzler/job_schema.yaml index d428bac..9a2ba58 100644 --- a/brozzler/job_schema.yaml +++ b/brozzler/job_schema.yaml @@ -75,6 +75,7 @@ id: max_hops_off_surt: type: integer + # ignored, left for backward compatibility remember_outlinks: type: boolean diff --git a/setup.py b/setup.py index 3d56d2f..3353b9e 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def find_package_data(package): setuptools.setup( name='brozzler', - version='1.1b9.dev206', + version='1.1b9.dev207', description='Distributed web crawling with browsers', url='https://github.com/internetarchive/brozzler', author='Noah Levitt', diff --git a/tests/test_frontier.py b/tests/test_frontier.py index 1601934..d276439 100644 --- a/tests/test_frontier.py +++ b/tests/test_frontier.py @@ -252,8 +252,7 @@ def test_field_defaults(): def test_scope_and_schedule_outlinks(): rr = doublethink.Rethinker('localhost', db='ignoreme') frontier = brozzler.RethinkDbFrontier(rr) - site = brozzler.Site(rr, { - 'seed':'http://example.com/', 'remember_outlinks':True}) + site = brozzler.Site(rr, {'seed':'http://example.com/'}) parent_page = brozzler.Page(rr, { 'hops_from_seed': 1, 'url': 'http://example.com/whatever'}) outlinks = [