Always save outlinks info on the RethinkDB page object and get rid of the 'remember_outlinks' option. This keeps the config simple, and saving outlinks is not very expensive anyway.

This commit is contained in:
Noah Levitt 2017-03-17 10:04:10 -07:00
parent 701f7654a8
commit 0685c77d01
4 changed files with 11 additions and 16 deletions

View file

@ -261,7 +261,6 @@ class RethinkDbFrontier:
site.save() site.save()
def scope_and_schedule_outlinks(self, site, parent_page, outlinks): def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
if site.remember_outlinks:
decisions = {"accepted":set(),"blocked":set(),"rejected":set()} decisions = {"accepted":set(),"blocked":set(),"rejected":set()}
counts = {"added":0,"updated":0,"rejected":0,"blocked":0} counts = {"added":0,"updated":0,"rejected":0,"blocked":0}
for url in outlinks or []: for url in outlinks or []:
@ -289,18 +288,14 @@ class RethinkDbFrontier:
else: else:
new_child_page.save() new_child_page.save()
counts["added"] += 1 counts["added"] += 1
if site.remember_outlinks:
decisions["accepted"].add(str(url_for_crawling)) decisions["accepted"].add(str(url_for_crawling))
else: else:
counts["blocked"] += 1 counts["blocked"] += 1
if site.remember_outlinks:
decisions["blocked"].add(str(url_for_crawling)) decisions["blocked"].add(str(url_for_crawling))
else: else:
counts["rejected"] += 1 counts["rejected"] += 1
if site.remember_outlinks:
decisions["rejected"].add(str(url_for_crawling)) decisions["rejected"].add(str(url_for_crawling))
if site.remember_outlinks:
parent_page.outlinks = {} parent_page.outlinks = {}
for k in decisions: for k in decisions:
parent_page.outlinks[k] = list(decisions[k]) parent_page.outlinks[k] = list(decisions[k])

View file

@ -75,6 +75,7 @@ id:
max_hops_off_surt: max_hops_off_surt:
type: integer type: integer
# ignored, left for backward compatibility
remember_outlinks: remember_outlinks:
type: boolean type: boolean

View file

@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup( setuptools.setup(
name='brozzler', name='brozzler',
version='1.1b9.dev206', version='1.1b9.dev207',
description='Distributed web crawling with browsers', description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler', url='https://github.com/internetarchive/brozzler',
author='Noah Levitt', author='Noah Levitt',

View file

@ -252,8 +252,7 @@ def test_field_defaults():
def test_scope_and_schedule_outlinks(): def test_scope_and_schedule_outlinks():
rr = doublethink.Rethinker('localhost', db='ignoreme') rr = doublethink.Rethinker('localhost', db='ignoreme')
frontier = brozzler.RethinkDbFrontier(rr) frontier = brozzler.RethinkDbFrontier(rr)
site = brozzler.Site(rr, { site = brozzler.Site(rr, {'seed':'http://example.com/'})
'seed':'http://example.com/', 'remember_outlinks':True})
parent_page = brozzler.Page(rr, { parent_page = brozzler.Page(rr, {
'hops_from_seed': 1, 'url': 'http://example.com/whatever'}) 'hops_from_seed': 1, 'url': 'http://example.com/whatever'})
outlinks = [ outlinks = [