Merge branch 'master' into qa

* master:
  Always save outlinks info on the RethinkDB page object and remove the 'remember_outlinks' option, both to keep the config simple and because saving outlinks is not very expensive.
This commit is contained in:
Noah Levitt 2017-03-17 10:04:18 -07:00
commit 775bfb123f
4 changed files with 11 additions and 16 deletions

View File

@ -261,8 +261,7 @@ class RethinkDbFrontier:
site.save()
def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
if site.remember_outlinks:
decisions = {"accepted":set(),"blocked":set(),"rejected":set()}
decisions = {"accepted":set(),"blocked":set(),"rejected":set()}
counts = {"added":0,"updated":0,"rejected":0,"blocked":0}
for url in outlinks or []:
url_for_scoping = urlcanon.semantic(url)
@ -289,22 +288,18 @@ class RethinkDbFrontier:
else:
new_child_page.save()
counts["added"] += 1
if site.remember_outlinks:
decisions["accepted"].add(str(url_for_crawling))
decisions["accepted"].add(str(url_for_crawling))
else:
counts["blocked"] += 1
if site.remember_outlinks:
decisions["blocked"].add(str(url_for_crawling))
decisions["blocked"].add(str(url_for_crawling))
else:
counts["rejected"] += 1
if site.remember_outlinks:
decisions["rejected"].add(str(url_for_crawling))
decisions["rejected"].add(str(url_for_crawling))
if site.remember_outlinks:
parent_page.outlinks = {}
for k in decisions:
parent_page.outlinks[k] = list(decisions[k])
parent_page.save()
parent_page.outlinks = {}
for k in decisions:
parent_page.outlinks[k] = list(decisions[k])
parent_page.save()
self.logger.info(
"%s new links added, %s existing links updated, %s links "

View File

@ -75,6 +75,7 @@ id:
max_hops_off_surt:
type: integer
# ignored (outlinks are now always saved); kept only for backward compatibility
remember_outlinks:
type: boolean

View File

@ -32,7 +32,7 @@ def find_package_data(package):
setuptools.setup(
name='brozzler',
version='1.1b9.dev206',
version='1.1b9.dev207',
description='Distributed web crawling with browsers',
url='https://github.com/internetarchive/brozzler',
author='Noah Levitt',

View File

@ -252,8 +252,7 @@ def test_field_defaults():
def test_scope_and_schedule_outlinks():
rr = doublethink.Rethinker('localhost', db='ignoreme')
frontier = brozzler.RethinkDbFrontier(rr)
site = brozzler.Site(rr, {
'seed':'http://example.com/', 'remember_outlinks':True})
site = brozzler.Site(rr, {'seed':'http://example.com/'})
parent_page = brozzler.Page(rr, {
'hops_from_seed': 1, 'url': 'http://example.com/whatever'})
outlinks = [