mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-06-20 04:44:12 -04:00
Always save outlinks info on the RethinkDB page object and get rid of the 'remember_outlinks' option, both to keep the config simple and because saving outlinks is not very expensive.
This commit is contained in:
parent
701f7654a8
commit
0685c77d01
4 changed files with 11 additions and 16 deletions
|
@@ -261,7 +261,6 @@ class RethinkDbFrontier:
|
||||||
site.save()
|
site.save()
|
||||||
|
|
||||||
def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
|
def scope_and_schedule_outlinks(self, site, parent_page, outlinks):
|
||||||
if site.remember_outlinks:
|
|
||||||
decisions = {"accepted":set(),"blocked":set(),"rejected":set()}
|
decisions = {"accepted":set(),"blocked":set(),"rejected":set()}
|
||||||
counts = {"added":0,"updated":0,"rejected":0,"blocked":0}
|
counts = {"added":0,"updated":0,"rejected":0,"blocked":0}
|
||||||
for url in outlinks or []:
|
for url in outlinks or []:
|
||||||
|
@@ -289,18 +288,14 @@ class RethinkDbFrontier:
|
||||||
else:
|
else:
|
||||||
new_child_page.save()
|
new_child_page.save()
|
||||||
counts["added"] += 1
|
counts["added"] += 1
|
||||||
if site.remember_outlinks:
|
|
||||||
decisions["accepted"].add(str(url_for_crawling))
|
decisions["accepted"].add(str(url_for_crawling))
|
||||||
else:
|
else:
|
||||||
counts["blocked"] += 1
|
counts["blocked"] += 1
|
||||||
if site.remember_outlinks:
|
|
||||||
decisions["blocked"].add(str(url_for_crawling))
|
decisions["blocked"].add(str(url_for_crawling))
|
||||||
else:
|
else:
|
||||||
counts["rejected"] += 1
|
counts["rejected"] += 1
|
||||||
if site.remember_outlinks:
|
|
||||||
decisions["rejected"].add(str(url_for_crawling))
|
decisions["rejected"].add(str(url_for_crawling))
|
||||||
|
|
||||||
if site.remember_outlinks:
|
|
||||||
parent_page.outlinks = {}
|
parent_page.outlinks = {}
|
||||||
for k in decisions:
|
for k in decisions:
|
||||||
parent_page.outlinks[k] = list(decisions[k])
|
parent_page.outlinks[k] = list(decisions[k])
|
||||||
|
|
|
@@ -75,6 +75,7 @@ id:
|
||||||
max_hops_off_surt:
|
max_hops_off_surt:
|
||||||
type: integer
|
type: integer
|
||||||
|
|
||||||
|
# ignored, left for backward compatibility
|
||||||
remember_outlinks:
|
remember_outlinks:
|
||||||
type: boolean
|
type: boolean
|
||||||
|
|
||||||
|
|
2
setup.py
2
setup.py
|
@@ -32,7 +32,7 @@ def find_package_data(package):
|
||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.1b9.dev206',
|
version='1.1b9.dev207',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
|
|
@@ -252,8 +252,7 @@ def test_field_defaults():
|
||||||
def test_scope_and_schedule_outlinks():
|
def test_scope_and_schedule_outlinks():
|
||||||
rr = doublethink.Rethinker('localhost', db='ignoreme')
|
rr = doublethink.Rethinker('localhost', db='ignoreme')
|
||||||
frontier = brozzler.RethinkDbFrontier(rr)
|
frontier = brozzler.RethinkDbFrontier(rr)
|
||||||
site = brozzler.Site(rr, {
|
site = brozzler.Site(rr, {'seed':'http://example.com/'})
|
||||||
'seed':'http://example.com/', 'remember_outlinks':True})
|
|
||||||
parent_page = brozzler.Page(rr, {
|
parent_page = brozzler.Page(rr, {
|
||||||
'hops_from_seed': 1, 'url': 'http://example.com/whatever'})
|
'hops_from_seed': 1, 'url': 'http://example.com/whatever'})
|
||||||
outlinks = [
|
outlinks = [
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue