if the parent page has a redirect_url, check scope rules against both the parent page's original url and the redirect url, with automated tests

Noah Levitt 2017-03-16 12:12:33 -07:00
parent 0021a9d5f0
commit 6c81b40e28
3 changed files with 125 additions and 8 deletions


@@ -93,8 +93,12 @@ class Site(doublethink.Document):
     def is_in_scope(self, url, parent_page=None):
         if not isinstance(url, urlcanon.ParsedUrl):
             url = urlcanon.semantic(url)
+        try_parent_urls = []
         if parent_page:
-            parent_url = urlcanon.semantic(parent_page.url)
+            try_parent_urls.append(urlcanon.semantic(parent_page.url))
+            if parent_page.redirect_url:
+                try_parent_urls.append(
+                        urlcanon.semantic(parent_page.redirect_url))
 
         might_accept = False
         if not url.scheme in (b'http', b'https'):
@@ -112,16 +116,25 @@ class Site(doublethink.Document):
         elif "accepts" in self.scope:
             for accept_rule in self.scope["accepts"]:
                 rule = urlcanon.MatchRule(**accept_rule)
-                if rule.applies(url, parent_url):
-                    might_accept = True
-                    break
+                if try_parent_urls:
+                    for parent_url in try_parent_urls:
+                        if rule.applies(url, parent_url):
+                            might_accept = True
+                else:
+                    if rule.applies(url):
+                        might_accept = True
 
         if might_accept:
             if "blocks" in self.scope:
                 for block_rule in self.scope["blocks"]:
                     rule = urlcanon.MatchRule(**block_rule)
-                    if rule.applies(url, parent_url):
-                        return False
+                    if try_parent_urls:
+                        for parent_url in try_parent_urls:
+                            if rule.applies(url, parent_url):
+                                return False
+                    else:
+                        if rule.applies(url):
+                            return False
             return True
         else:
             return False
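For reference, a minimal sketch (not part of this commit) of the behavior the change above enables: an outlink is now in scope if either the parent page's original url or its redirect_url satisfies a parent_url_regex accept rule. It assumes the same local RethinkDB setup the test suite uses and relies only on the Site/Page/is_in_scope API shown in this diff; the urls are borrowed from the new test below.

import doublethink
import brozzler

rr = doublethink.Rethinker('localhost', db='ignoreme')

# accept outlinks only when the parent page lives under /acceptme/
site = brozzler.Site(rr, {
    'seed': 'http://example.com/foo/',
    'scope': {'accepts': [{
        'parent_url_regex': '^http://example.com/acceptme/.*$'}]}})
site.save()

# the parent was requested at /toot/blah but redirected to /acceptme/futz
parent_page = brozzler.Page(rr, {
    'site_id': site.id,
    'url': 'http://example.com/toot/blah',
    'redirect_url': 'http://example.com/acceptme/futz'})

# before this commit only parent_page.url was consulted, so this outlink
# stayed out of scope; now the redirect_url is also tried against the rule
assert site.is_in_scope(
        'https://some-random-url.com/', parent_page=parent_page)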


@@ -32,7 +32,7 @@ def find_package_data(package):
 
 setuptools.setup(
         name='brozzler',
-        version='1.1b9.dev204',
+        version='1.1b9.dev205',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',


@@ -249,7 +249,7 @@ def test_field_defaults():
     assert kob.id
     assert kob.starts_and_stops
 
-def test_scope_and_scheduled_outlinks():
+def test_scope_and_schedule_outlinks():
     rr = doublethink.Rethinker('localhost', db='ignoreme')
     frontier = brozzler.RethinkDbFrontier(rr)
     site = brozzler.Site(rr, {
@@ -287,6 +287,109 @@ def test_scope_and_scheduled_outlinks():
         id = brozzler.Page.compute_id(site.id, url)
         assert brozzler.Page.load(rr, id)
 
+def test_parent_url_scoping():
+    rr = doublethink.Rethinker('localhost', db='ignoreme')
+    frontier = brozzler.RethinkDbFrontier(rr)
+
+    # scope rules that look at parent page url should consider both the
+    # original url and the redirect url, if any, of the parent page
+    site = brozzler.Site(rr, {
+        'seed': 'http://example.com/foo/',
+        'scope': {
+            'accepts': [{
+                'parent_url_regex': '^http://example.com/acceptme/.*$'}],
+            'blocks': [{
+                'parent_url_regex': '^http://example.com/blockme/.*$'}],
+        },
+        'remember_outlinks': True})
+    site.save()
+
+    # an outlink that would not otherwise be in scope
+    outlinks = ['https://some-random-url.com/']
+
+    # parent page does not match any parent_url_regex
+    parent_page = brozzler.Page(rr, {
+        'site_id': site.id,
+        'url': 'http://example.com/foo/spluh'})
+    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
+    brozzler.is_permitted_by_robots = lambda *args: True
+    try:
+        frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
+    finally:
+        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
+    assert parent_page.outlinks['rejected'] == outlinks
+    assert parent_page.outlinks['accepted'] == []
+
+    # parent page url matches accept parent_url_regex
+    parent_page = brozzler.Page(rr, {
+        'site_id': site.id,
+        'url': 'http://example.com/acceptme/futz'})
+    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
+    brozzler.is_permitted_by_robots = lambda *args: True
+    try:
+        frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
+    finally:
+        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
+    assert parent_page.outlinks['rejected'] == []
+    assert parent_page.outlinks['accepted'] == outlinks
+
+    # parent page redirect_url matches accept parent_url_regex
+    parent_page = brozzler.Page(rr, {
+        'site_id': site.id,
+        'url': 'http://example.com/toot/blah',
+        'redirect_url': 'http://example.com/acceptme/futz'})
+    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
+    brozzler.is_permitted_by_robots = lambda *args: True
+    try:
+        frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
+    finally:
+        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
+    assert parent_page.outlinks['rejected'] == []
+    assert parent_page.outlinks['accepted'] == outlinks
+
+    # an outlink that would normally be in scope
+    outlinks = ['http://example.com/foo/whatever/']
+
+    # parent page does not match any parent_url_regex
+    parent_page = brozzler.Page(rr, {
+        'site_id': site.id,
+        'url': 'http://example.com/foo/spluh'})
+    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
+    brozzler.is_permitted_by_robots = lambda *args: True
+    try:
+        frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
+    finally:
+        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
+    assert parent_page.outlinks['rejected'] == []
+    assert parent_page.outlinks['accepted'] == outlinks
+
+    # parent page url matches block parent_url_regex
+    parent_page = brozzler.Page(rr, {
+        'site_id': site.id,
+        'url': 'http://example.com/blockme/futz'})
+    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
+    brozzler.is_permitted_by_robots = lambda *args: True
+    try:
+        frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
+    finally:
+        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
+    assert parent_page.outlinks['rejected'] == outlinks
+    assert parent_page.outlinks['accepted'] == []
+
+    # parent page redirect_url matches block parent_url_regex
+    parent_page = brozzler.Page(rr, {
+        'site_id': site.id,
+        'url': 'http://example.com/toot/blah',
+        'redirect_url': 'http://example.com/blockme/futz'})
+    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
+    brozzler.is_permitted_by_robots = lambda *args: True
+    try:
+        frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
+    finally:
+        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
+    assert parent_page.outlinks['rejected'] == outlinks
+    assert parent_page.outlinks['accepted'] == []
+
 def test_completed_page():
     rr = doublethink.Rethinker('localhost', db='ignoreme')
     frontier = brozzler.RethinkDbFrontier(rr)
@@ -357,3 +460,4 @@ def test_completed_page():
     page.refresh()
     assert page.brozzle_count == 1
     assert page.claimed == False
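The new test stubs brozzler.is_permitted_by_robots by hand before each call to scope_and_schedule_outlinks and restores it in a finally block. A small context manager could cut that repetition; the sketch below is hypothetical (the helper and its name are not part of this commit) and only repackages the monkeypatching pattern the test already uses.

import contextlib

import brozzler

@contextlib.contextmanager
def permissive_robots():
    # hypothetical helper: temporarily stub out robots.txt checking,
    # mirroring the try/finally pattern repeated in test_parent_url_scoping
    orig = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        yield
    finally:
        brozzler.is_permitted_by_robots = orig

# usage inside a test case:
# with permissive_robots():
#     frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)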