mirror of
https://github.com/internetarchive/brozzler.git
synced 2025-02-24 08:39:59 -05:00
if parent page has a redirect_url, check scope rules both with the parent_page original url and with the redirect url, with automated tests
This commit is contained in:
parent
0021a9d5f0
commit
6c81b40e28
@ -93,8 +93,12 @@ class Site(doublethink.Document):
|
|||||||
def is_in_scope(self, url, parent_page=None):
|
def is_in_scope(self, url, parent_page=None):
|
||||||
if not isinstance(url, urlcanon.ParsedUrl):
|
if not isinstance(url, urlcanon.ParsedUrl):
|
||||||
url = urlcanon.semantic(url)
|
url = urlcanon.semantic(url)
|
||||||
|
try_parent_urls = []
|
||||||
if parent_page:
|
if parent_page:
|
||||||
parent_url = urlcanon.semantic(parent_page.url)
|
try_parent_urls.append(urlcanon.semantic(parent_page.url))
|
||||||
|
if parent_page.redirect_url:
|
||||||
|
try_parent_urls.append(
|
||||||
|
urlcanon.semantic(parent_page.redirect_url))
|
||||||
|
|
||||||
might_accept = False
|
might_accept = False
|
||||||
if not url.scheme in (b'http', b'https'):
|
if not url.scheme in (b'http', b'https'):
|
||||||
@ -112,16 +116,25 @@ class Site(doublethink.Document):
|
|||||||
elif "accepts" in self.scope:
|
elif "accepts" in self.scope:
|
||||||
for accept_rule in self.scope["accepts"]:
|
for accept_rule in self.scope["accepts"]:
|
||||||
rule = urlcanon.MatchRule(**accept_rule)
|
rule = urlcanon.MatchRule(**accept_rule)
|
||||||
if rule.applies(url, parent_url):
|
if try_parent_urls:
|
||||||
might_accept = True
|
for parent_url in try_parent_urls:
|
||||||
break
|
if rule.applies(url, parent_url):
|
||||||
|
might_accept = True
|
||||||
|
else:
|
||||||
|
if rule.applies(url):
|
||||||
|
might_accept = True
|
||||||
|
|
||||||
if might_accept:
|
if might_accept:
|
||||||
if "blocks" in self.scope:
|
if "blocks" in self.scope:
|
||||||
for block_rule in self.scope["blocks"]:
|
for block_rule in self.scope["blocks"]:
|
||||||
rule = urlcanon.MatchRule(**block_rule)
|
rule = urlcanon.MatchRule(**block_rule)
|
||||||
if rule.applies(url, parent_url):
|
if try_parent_urls:
|
||||||
return False
|
for parent_url in try_parent_urls:
|
||||||
|
if rule.applies(url, parent_url):
|
||||||
|
return False
|
||||||
|
else:
|
||||||
|
if rule.applies(url):
|
||||||
|
return False
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
return False
|
return False
|
||||||
|
2
setup.py
2
setup.py
@ -32,7 +32,7 @@ def find_package_data(package):
|
|||||||
|
|
||||||
setuptools.setup(
|
setuptools.setup(
|
||||||
name='brozzler',
|
name='brozzler',
|
||||||
version='1.1b9.dev204',
|
version='1.1b9.dev205',
|
||||||
description='Distributed web crawling with browsers',
|
description='Distributed web crawling with browsers',
|
||||||
url='https://github.com/internetarchive/brozzler',
|
url='https://github.com/internetarchive/brozzler',
|
||||||
author='Noah Levitt',
|
author='Noah Levitt',
|
||||||
|
@ -249,7 +249,7 @@ def test_field_defaults():
|
|||||||
assert kob.id
|
assert kob.id
|
||||||
assert kob.starts_and_stops
|
assert kob.starts_and_stops
|
||||||
|
|
||||||
def test_scope_and_scheduled_outlinks():
|
def test_scope_and_schedule_outlinks():
|
||||||
rr = doublethink.Rethinker('localhost', db='ignoreme')
|
rr = doublethink.Rethinker('localhost', db='ignoreme')
|
||||||
frontier = brozzler.RethinkDbFrontier(rr)
|
frontier = brozzler.RethinkDbFrontier(rr)
|
||||||
site = brozzler.Site(rr, {
|
site = brozzler.Site(rr, {
|
||||||
@ -287,6 +287,109 @@ def test_scope_and_scheduled_outlinks():
|
|||||||
id = brozzler.Page.compute_id(site.id, url)
|
id = brozzler.Page.compute_id(site.id, url)
|
||||||
assert brozzler.Page.load(rr, id)
|
assert brozzler.Page.load(rr, id)
|
||||||
|
|
||||||
|
def test_parent_url_scoping():
    '''
    Checks that scope rules using `parent_url_regex` are evaluated against
    both the parent page's original url and its redirect_url, if any.

    Each case builds a parent page, runs
    `frontier.scope_and_schedule_outlinks()` on a single outlink, and then
    checks which of `parent_page.outlinks['accepted']` /
    `parent_page.outlinks['rejected']` the outlink ended up in.
    '''
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    # scope rules that look at parent page url should consider both the
    # original url and the redirect url, if any, of the parent page
    site = brozzler.Site(rr, {
        'seed': 'http://example.com/foo/',
        'scope': {
            'accepts': [{
                'parent_url_regex': '^http://example.com/acceptme/.*$'}],
            'blocks': [{
                'parent_url_regex': '^http://example.com/blockme/.*$'}],
        },
        'remember_outlinks': True})
    site.save()

    def scope_and_schedule(parent_page, outlinks):
        # Temporarily replace brozzler.is_permitted_by_robots with a stub
        # that always returns True, so that only the scope rules decide
        # whether an outlink is accepted; the original is always restored.
        orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
        brozzler.is_permitted_by_robots = lambda *args: True
        try:
            frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
        finally:
            brozzler.is_permitted_by_robots = orig_is_permitted_by_robots

    # an outlink that would not otherwise be in scope
    outlinks = ['https://some-random-url.com/']

    # parent page does not match any parent_url_regex
    parent_page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/foo/spluh'})
    scope_and_schedule(parent_page, outlinks)
    assert parent_page.outlinks['rejected'] == outlinks
    assert parent_page.outlinks['accepted'] == []

    # parent page url matches accept parent_url_regex
    parent_page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/acceptme/futz'})
    scope_and_schedule(parent_page, outlinks)
    assert parent_page.outlinks['rejected'] == []
    assert parent_page.outlinks['accepted'] == outlinks

    # parent page redirect_url matches accept parent_url_regex
    # (the page built here is the one that must be scheduled and checked;
    # reusing the page from the previous case would leave the redirect_url
    # code path untested)
    parent_page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/toot/blah',
        'redirect_url': 'http://example.com/acceptme/futz'})
    scope_and_schedule(parent_page, outlinks)
    assert parent_page.outlinks['rejected'] == []
    assert parent_page.outlinks['accepted'] == outlinks

    # an outlink that would normally be in scope
    outlinks = ['http://example.com/foo/whatever/']

    # parent page does not match any parent_url_regex
    parent_page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/foo/spluh'})
    scope_and_schedule(parent_page, outlinks)
    assert parent_page.outlinks['rejected'] == []
    assert parent_page.outlinks['accepted'] == outlinks

    # parent page url matches block parent_url_regex
    parent_page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/blockme/futz'})
    scope_and_schedule(parent_page, outlinks)
    assert parent_page.outlinks['rejected'] == outlinks
    assert parent_page.outlinks['accepted'] == []

    # parent page redirect_url matches block parent_url_regex
    # (again, schedule and check the page that actually has the
    # redirect_url, not the one left over from the previous case)
    parent_page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/toot/blah',
        'redirect_url': 'http://example.com/blockme/futz'})
    scope_and_schedule(parent_page, outlinks)
    assert parent_page.outlinks['rejected'] == outlinks
    assert parent_page.outlinks['accepted'] == []
|
||||||
|
|
||||||
def test_completed_page():
|
def test_completed_page():
|
||||||
rr = doublethink.Rethinker('localhost', db='ignoreme')
|
rr = doublethink.Rethinker('localhost', db='ignoreme')
|
||||||
frontier = brozzler.RethinkDbFrontier(rr)
|
frontier = brozzler.RethinkDbFrontier(rr)
|
||||||
@ -357,3 +460,4 @@ def test_completed_page():
|
|||||||
page.refresh()
|
page.refresh()
|
||||||
assert page.brozzle_count == 1
|
assert page.brozzle_count == 1
|
||||||
assert page.claimed == False
|
assert page.claimed == False
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user