mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-04-19 15:25:59 -04:00
Always save outlinks info on the RethinkDB page object and remove the 'remember_outlinks' option: this keeps the config simple, and saving the outlinks is not very expensive.
This commit is contained in:
parent
701f7654a8
commit
0685c77d01
@ -261,8 +261,7 @@ class RethinkDbFrontier:
|
||||
site.save()
|
||||
|
||||
def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
|
||||
if site.remember_outlinks:
|
||||
decisions = {"accepted":set(),"blocked":set(),"rejected":set()}
|
||||
decisions = {"accepted":set(),"blocked":set(),"rejected":set()}
|
||||
counts = {"added":0,"updated":0,"rejected":0,"blocked":0}
|
||||
for url in outlinks or []:
|
||||
url_for_scoping = urlcanon.semantic(url)
|
||||
@ -289,22 +288,18 @@ class RethinkDbFrontier:
|
||||
else:
|
||||
new_child_page.save()
|
||||
counts["added"] += 1
|
||||
if site.remember_outlinks:
|
||||
decisions["accepted"].add(str(url_for_crawling))
|
||||
decisions["accepted"].add(str(url_for_crawling))
|
||||
else:
|
||||
counts["blocked"] += 1
|
||||
if site.remember_outlinks:
|
||||
decisions["blocked"].add(str(url_for_crawling))
|
||||
decisions["blocked"].add(str(url_for_crawling))
|
||||
else:
|
||||
counts["rejected"] += 1
|
||||
if site.remember_outlinks:
|
||||
decisions["rejected"].add(str(url_for_crawling))
|
||||
decisions["rejected"].add(str(url_for_crawling))
|
||||
|
||||
if site.remember_outlinks:
|
||||
parent_page.outlinks = {}
|
||||
for k in decisions:
|
||||
parent_page.outlinks[k] = list(decisions[k])
|
||||
parent_page.save()
|
||||
parent_page.outlinks = {}
|
||||
for k in decisions:
|
||||
parent_page.outlinks[k] = list(decisions[k])
|
||||
parent_page.save()
|
||||
|
||||
self.logger.info(
|
||||
"%s new links added, %s existing links updated, %s links "
|
||||
|
@ -75,6 +75,7 @@ id:
|
||||
max_hops_off_surt:
|
||||
type: integer
|
||||
|
||||
# ignored, left for backward compatibility
|
||||
remember_outlinks:
|
||||
type: boolean
|
||||
|
||||
|
2
setup.py
2
setup.py
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||
|
||||
setuptools.setup(
|
||||
name='brozzler',
|
||||
version='1.1b9.dev206',
|
||||
version='1.1b9.dev207',
|
||||
description='Distributed web crawling with browsers',
|
||||
url='https://github.com/internetarchive/brozzler',
|
||||
author='Noah Levitt',
|
||||
|
@ -252,8 +252,7 @@ def test_field_defaults():
|
||||
def test_scope_and_schedule_outlinks():
|
||||
rr = doublethink.Rethinker('localhost', db='ignoreme')
|
||||
frontier = brozzler.RethinkDbFrontier(rr)
|
||||
site = brozzler.Site(rr, {
|
||||
'seed':'http://example.com/', 'remember_outlinks':True})
|
||||
site = brozzler.Site(rr, {'seed':'http://example.com/'})
|
||||
parent_page = brozzler.Page(rr, {
|
||||
'hops_from_seed': 1, 'url': 'http://example.com/whatever'})
|
||||
outlinks = [
|
||||
|
Loading…
x
Reference in New Issue
Block a user