From 6c81b40e282fcdc1c57f644b4ee0173ad006ddc5 Mon Sep 17 00:00:00 2001
From: Noah Levitt
Date: Thu, 16 Mar 2017 12:12:33 -0700
Subject: [PATCH] if parent page has a redirect_url, check scope rules both with the parent_page original url and with the redirect url, with automated tests

---
Review notes (ignored by git am):
 - line structure and indentation restored from a whitespace-collapsed copy;
   context whitespace is reconstructed, so apply with
   `git apply --ignore-whitespace` if a hunk fails to match
 - in test_parent_url_scoping the two redirect_url cases now pass the page
   that carries redirect_url to scope_and_schedule_outlinks (it was built as
   parent_page_c and never used, so the redirect path was not exercised)

 brozzler/site.py       |  25 +++++++---
 setup.py               |   2 +-
 tests/test_frontier.py | 106 ++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 125 insertions(+), 8 deletions(-)

diff --git a/brozzler/site.py b/brozzler/site.py
index 2285346..a61ec59 100644
--- a/brozzler/site.py
+++ b/brozzler/site.py
@@ -93,8 +93,12 @@ class Site(doublethink.Document):
     def is_in_scope(self, url, parent_page=None):
         if not isinstance(url, urlcanon.ParsedUrl):
             url = urlcanon.semantic(url)
+        try_parent_urls = []
         if parent_page:
-            parent_url = urlcanon.semantic(parent_page.url)
+            try_parent_urls.append(urlcanon.semantic(parent_page.url))
+            if parent_page.redirect_url:
+                try_parent_urls.append(
+                        urlcanon.semantic(parent_page.redirect_url))
 
         might_accept = False
         if not url.scheme in (b'http', b'https'):
@@ -112,16 +116,25 @@ class Site(doublethink.Document):
         elif "accepts" in self.scope:
             for accept_rule in self.scope["accepts"]:
                 rule = urlcanon.MatchRule(**accept_rule)
-                if rule.applies(url, parent_url):
-                    might_accept = True
-                    break
+                if try_parent_urls:
+                    for parent_url in try_parent_urls:
+                        if rule.applies(url, parent_url):
+                            might_accept = True
+                else:
+                    if rule.applies(url):
+                        might_accept = True
 
         if might_accept:
             if "blocks" in self.scope:
                 for block_rule in self.scope["blocks"]:
                     rule = urlcanon.MatchRule(**block_rule)
-                    if rule.applies(url, parent_url):
-                        return False
+                    if try_parent_urls:
+                        for parent_url in try_parent_urls:
+                            if rule.applies(url, parent_url):
+                                return False
+                    else:
+                        if rule.applies(url):
+                            return False
             return True
         else:
             return False
diff --git a/setup.py b/setup.py
index 82fd043..7368370 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@ def find_package_data(package):
 
 setuptools.setup(
         name='brozzler',
-        version='1.1b9.dev204',
+        version='1.1b9.dev205',
         description='Distributed web crawling with browsers',
         url='https://github.com/internetarchive/brozzler',
         author='Noah Levitt',
diff --git a/tests/test_frontier.py b/tests/test_frontier.py
index 0e62a97..1601934 100644
--- a/tests/test_frontier.py
+++ b/tests/test_frontier.py
@@ -249,7 +249,7 @@ def test_field_defaults():
     assert kob.id
     assert kob.starts_and_stops
 
-def test_scope_and_scheduled_outlinks():
+def test_scope_and_schedule_outlinks():
     rr = doublethink.Rethinker('localhost', db='ignoreme')
     frontier = brozzler.RethinkDbFrontier(rr)
     site = brozzler.Site(rr, {
@@ -287,6 +287,109 @@ def test_scope_and_scheduled_outlinks():
         id = brozzler.Page.compute_id(site.id, url)
         assert brozzler.Page.load(rr, id)
 
+def test_parent_url_scoping():
+    rr = doublethink.Rethinker('localhost', db='ignoreme')
+    frontier = brozzler.RethinkDbFrontier(rr)
+
+    # scope rules that look at parent page url should consider both the
+    # original url and the redirect url, if any, of the parent page
+    site = brozzler.Site(rr, {
+        'seed': 'http://example.com/foo/',
+        'scope': {
+            'accepts': [{
+                'parent_url_regex': '^http://example.com/acceptme/.*$'}],
+            'blocks': [{
+                'parent_url_regex': '^http://example.com/blockme/.*$'}],
+            },
+        'remember_outlinks': True})
+    site.save()
+
+    # an outlink that would not otherwise be in scope
+    outlinks = ['https://some-random-url.com/']
+
+    # parent page does not match any parent_url_regex
+    parent_page = brozzler.Page(rr, {
+        'site_id': site.id,
+        'url': 'http://example.com/foo/spluh'})
+    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
+    brozzler.is_permitted_by_robots = lambda *args: True
+    try:
+        frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
+    finally:
+        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
+    assert parent_page.outlinks['rejected'] == outlinks
+    assert parent_page.outlinks['accepted'] == []
+
+    # parent page url matches accept parent_url_regex
+    parent_page = brozzler.Page(rr, {
+        'site_id': site.id,
+        'url': 'http://example.com/acceptme/futz'})
+    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
+    brozzler.is_permitted_by_robots = lambda *args: True
+    try:
+        frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
+    finally:
+        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
+    assert parent_page.outlinks['rejected'] == []
+    assert parent_page.outlinks['accepted'] == outlinks
+
+    # parent page redirect_url matches accept parent_url_regex
+    parent_page = brozzler.Page(rr, {
+        'site_id': site.id,
+        'url': 'http://example.com/toot/blah',
+        'redirect_url':'http://example.com/acceptme/futz'})
+    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
+    brozzler.is_permitted_by_robots = lambda *args: True
+    try:
+        frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
+    finally:
+        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
+    assert parent_page.outlinks['rejected'] == []
+    assert parent_page.outlinks['accepted'] == outlinks
+
+    # an outlink that would normally be in scope
+    outlinks = ['http://example.com/foo/whatever/']
+
+    # parent page does not match any parent_url_regex
+    parent_page = brozzler.Page(rr, {
+        'site_id': site.id,
+        'url': 'http://example.com/foo/spluh'})
+    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
+    brozzler.is_permitted_by_robots = lambda *args: True
+    try:
+        frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
+    finally:
+        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
+    assert parent_page.outlinks['rejected'] == []
+    assert parent_page.outlinks['accepted'] == outlinks
+
+    # parent page url matches block parent_url_regex
+    parent_page = brozzler.Page(rr, {
+        'site_id': site.id,
+        'url': 'http://example.com/blockme/futz'})
+    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
+    brozzler.is_permitted_by_robots = lambda *args: True
+    try:
+        frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
+    finally:
+        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
+    assert parent_page.outlinks['rejected'] == outlinks
+    assert parent_page.outlinks['accepted'] == []
+
+    # parent page redirect_url matches block parent_url_regex
+    parent_page = brozzler.Page(rr, {
+        'site_id': site.id,
+        'url': 'http://example.com/toot/blah',
+        'redirect_url':'http://example.com/blockme/futz'})
+    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
+    brozzler.is_permitted_by_robots = lambda *args: True
+    try:
+        frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
+    finally:
+        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
+    assert parent_page.outlinks['rejected'] == outlinks
+    assert parent_page.outlinks['accepted'] == []
+
 def test_completed_page():
     rr = doublethink.Rethinker('localhost', db='ignoreme')
     frontier = brozzler.RethinkDbFrontier(rr)
@@ -357,3 +460,4 @@ def test_completed_page():
     page.refresh()
     assert page.brozzle_count == 1
     assert page.claimed == False
+