mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 08:39:59 -05:00
If the parent page has a redirect_url, check scope rules against both the parent page's original url and its redirect url; includes automated tests
This commit is contained in:
parent
0021a9d5f0
commit
6c81b40e28
@ -93,8 +93,12 @@ class Site(doublethink.Document):
|
||||
def is_in_scope(self, url, parent_page=None):
|
||||
if not isinstance(url, urlcanon.ParsedUrl):
|
||||
url = urlcanon.semantic(url)
|
||||
try_parent_urls = []
|
||||
if parent_page:
|
||||
parent_url = urlcanon.semantic(parent_page.url)
|
||||
try_parent_urls.append(urlcanon.semantic(parent_page.url))
|
||||
if parent_page.redirect_url:
|
||||
try_parent_urls.append(
|
||||
urlcanon.semantic(parent_page.redirect_url))
|
||||
|
||||
might_accept = False
|
||||
if not url.scheme in (b'http', b'https'):
|
||||
@ -112,16 +116,25 @@ class Site(doublethink.Document):
|
||||
elif "accepts" in self.scope:
|
||||
for accept_rule in self.scope["accepts"]:
|
||||
rule = urlcanon.MatchRule(**accept_rule)
|
||||
if rule.applies(url, parent_url):
|
||||
might_accept = True
|
||||
break
|
||||
if try_parent_urls:
|
||||
for parent_url in try_parent_urls:
|
||||
if rule.applies(url, parent_url):
|
||||
might_accept = True
|
||||
else:
|
||||
if rule.applies(url):
|
||||
might_accept = True
|
||||
|
||||
if might_accept:
|
||||
if "blocks" in self.scope:
|
||||
for block_rule in self.scope["blocks"]:
|
||||
rule = urlcanon.MatchRule(**block_rule)
|
||||
if rule.applies(url, parent_url):
|
||||
return False
|
||||
if try_parent_urls:
|
||||
for parent_url in try_parent_urls:
|
||||
if rule.applies(url, parent_url):
|
||||
return False
|
||||
else:
|
||||
if rule.applies(url):
|
||||
return False
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
2
setup.py
2
setup.py
@ -32,7 +32,7 @@ def find_package_data(package):
|
||||
|
||||
setuptools.setup(
|
||||
name='brozzler',
|
||||
version='1.1b9.dev204',
|
||||
version='1.1b9.dev205',
|
||||
description='Distributed web crawling with browsers',
|
||||
url='https://github.com/internetarchive/brozzler',
|
||||
author='Noah Levitt',
|
||||
|
@ -249,7 +249,7 @@ def test_field_defaults():
|
||||
assert kob.id
|
||||
assert kob.starts_and_stops
|
||||
|
||||
def test_scope_and_scheduled_outlinks():
|
||||
def test_scope_and_schedule_outlinks():
|
||||
rr = doublethink.Rethinker('localhost', db='ignoreme')
|
||||
frontier = brozzler.RethinkDbFrontier(rr)
|
||||
site = brozzler.Site(rr, {
|
||||
@ -287,6 +287,109 @@ def test_scope_and_scheduled_outlinks():
|
||||
id = brozzler.Page.compute_id(site.id, url)
|
||||
assert brozzler.Page.load(rr, id)
|
||||
|
||||
def test_parent_url_scoping():
    """Check parent-url scope rules against a parent page's url and redirect.

    Scope rules that look at the parent page url (``parent_url_regex``)
    should consider both the original url and the redirect url, if any, of
    the parent page. Each case schedules one outlink from a different parent
    page and asserts whether it ends up in ``outlinks['accepted']`` or
    ``outlinks['rejected']`` on that same parent page.
    """
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    site = brozzler.Site(rr, {
        'seed': 'http://example.com/foo/',
        'scope': {
            'accepts': [{
                'parent_url_regex': '^http://example.com/acceptme/.*$'}],
            'blocks': [{
                'parent_url_regex': '^http://example.com/blockme/.*$'}],
        },
        'remember_outlinks': True})
    site.save()

    def schedule(parent_page, outlinks):
        # brozzler.is_permitted_by_robots is swapped for a stub that always
        # returns True for the duration of the call; the original is put
        # back even if scheduling raises, so other tests are unaffected
        orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
        brozzler.is_permitted_by_robots = lambda *args: True
        try:
            frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
        finally:
            brozzler.is_permitted_by_robots = orig_is_permitted_by_robots

    # an outlink that would not otherwise be in scope
    outlinks = ['https://some-random-url.com/']

    # parent page does not match any parent_url_regex
    parent_page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/foo/spluh'})
    schedule(parent_page, outlinks)
    assert parent_page.outlinks['rejected'] == outlinks
    assert parent_page.outlinks['accepted'] == []

    # parent page url matches accept parent_url_regex
    parent_page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/acceptme/futz'})
    schedule(parent_page, outlinks)
    assert parent_page.outlinks['rejected'] == []
    assert parent_page.outlinks['accepted'] == outlinks

    # parent page redirect_url matches accept parent_url_regex
    # (the redirecting page itself must be the one scheduled and checked;
    # reusing the page from the previous case would pass without ever
    # looking at redirect_url)
    redirecting_page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/toot/blah',
        'redirect_url': 'http://example.com/acceptme/futz'})
    schedule(redirecting_page, outlinks)
    assert redirecting_page.outlinks['rejected'] == []
    assert redirecting_page.outlinks['accepted'] == outlinks

    # an outlink that would normally be in scope
    outlinks = ['http://example.com/foo/whatever/']

    # parent page does not match any parent_url_regex
    parent_page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/foo/spluh'})
    schedule(parent_page, outlinks)
    assert parent_page.outlinks['rejected'] == []
    assert parent_page.outlinks['accepted'] == outlinks

    # parent page url matches block parent_url_regex
    parent_page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/blockme/futz'})
    schedule(parent_page, outlinks)
    assert parent_page.outlinks['rejected'] == outlinks
    assert parent_page.outlinks['accepted'] == []

    # parent page redirect_url matches block parent_url_regex
    # (again scheduled and asserted on the redirecting page itself)
    redirecting_page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/toot/blah',
        'redirect_url': 'http://example.com/blockme/futz'})
    schedule(redirecting_page, outlinks)
    assert redirecting_page.outlinks['rejected'] == outlinks
    assert redirecting_page.outlinks['accepted'] == []
|
||||
|
||||
def test_completed_page():
|
||||
rr = doublethink.Rethinker('localhost', db='ignoreme')
|
||||
frontier = brozzler.RethinkDbFrontier(rr)
|
||||
@ -357,3 +460,4 @@ def test_completed_page():
|
||||
page.refresh()
|
||||
assert page.brozzle_count == 1
|
||||
assert page.claimed == False
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user